Work on EMBL and Genbank parser efficiency

Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07
This commit is contained in:
2024-02-20 13:23:07 +01:00
parent 95caebec8f
commit 1542ce4c63
4 changed files with 69 additions and 14 deletions

View File

@ -2,6 +2,14 @@
## Latest changes ## Latest changes
### Enhancement
- The bug fix for the parsing of EMBL and Genbank files implemented in version 4.1.2 of OBITools4
potentially reduced parsing performance. This has now been fixed.
- Along the same lines, the Genbank and EMBL parsers were reading and storing in memory not only the sequence
but also its annotations (the features table). So far, none of the obitools use this information, and
with large complete genomes it occupies a lot of memory. To reduce this impact, the new version of the
parser no longer stores the annotations in memory by default.
## February 16th, 2024. Release 4.1.2 ## February 16th, 2024. Release 4.1.2
### Bug fixes ### Bug fixes

View File

@ -94,7 +94,11 @@ func _EndOfLastEntry(buff []byte) int {
return -1 return -1
} }
func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence) { func _ParseEmblFile(source string, input <-chan _FileChunk,
out obiiter.IBioSequence,
withFeatureTable bool,
batch_size int,
total_seq_size int) {
for chunks := range input { for chunks := range input {
scanner := bufio.NewScanner(chunks.raw) scanner := bufio.NewScanner(chunks.raw)
@ -120,14 +124,16 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
defBytes.WriteByte(' ') defBytes.WriteByte(' ')
} }
defBytes.WriteString(strings.TrimSpace(line[5:])) defBytes.WriteString(strings.TrimSpace(line[5:]))
case strings.HasPrefix(line, "FH "): case withFeatureTable && strings.HasPrefix(line, "FH "):
featBytes.WriteString(line) featBytes.WriteString(line)
case line == "FH": case withFeatureTable && line == "FH":
featBytes.WriteByte('\n') featBytes.WriteByte('\n')
featBytes.WriteString(line) featBytes.WriteString(line)
case strings.HasPrefix(line, "FT "): case strings.HasPrefix(line, "FT "):
if withFeatureTable {
featBytes.WriteByte('\n') featBytes.WriteByte('\n')
featBytes.WriteString(line) featBytes.WriteString(line)
}
if strings.HasPrefix(line, `FT /db_xref="taxon:`) { if strings.HasPrefix(line, `FT /db_xref="taxon:`) {
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0]) taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
} }
@ -143,7 +149,9 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
defBytes.String()) defBytes.String())
sequence.SetSource(source) sequence.SetSource(source)
if withFeatureTable {
sequence.SetFeatures(featBytes.Bytes()) sequence.SetFeatures(featBytes.Bytes())
}
annot := sequence.Annotations() annot := sequence.Annotations()
annot["scientific_name"] = scientificName annot["scientific_name"] = scientificName
@ -198,7 +206,7 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
for err == nil { for err == nil {
// Create an extended buffer to read from if the end of the last entry is not found in the current buffer // Create an extended buffer to read from if the end of the last entry is not found in the current buffer
extbuff := make([]byte, 1<<22) extbuff := make([]byte, _FileChunkSize)
end := 0 end := 0
ic := 0 ic := 0
@ -278,7 +286,9 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
// for j := 0; j < opt.ParallelWorkers(); j++ { // for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ { for j := 0; j < nworkers; j++ {
go _ParseEmblFile(opt.Source(), entry_channel, newIter) go _ParseEmblFile(opt.Source(), entry_channel, newIter,
opt.WithFeatureTable(),
opt.BatchSize(), opt.TotalSeqSize())
} }
go _ReadFlatFileChunk(reader, entry_channel) go _ReadFlatFileChunk(reader, entry_channel)

View File

@ -30,7 +30,10 @@ var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
func _ParseGenbankFile(source string, func _ParseGenbankFile(source string,
input <-chan _FileChunk, out obiiter.IBioSequence, input <-chan _FileChunk, out obiiter.IBioSequence,
chunck_order func() int) { chunck_order func() int,
withFeatureTable bool,
batch_size int,
total_seq_size int) {
state := inHeader state := inHeader
previous_chunk := -1 previous_chunk := -1
@ -143,7 +146,10 @@ func _ParseGenbankFile(source string,
seqBytes.Bytes(), seqBytes.Bytes(),
defBytes.String()) defBytes.String())
sequence.SetSource(source) sequence.SetSource(source)
if withFeatureTable {
sequence.SetFeatures(featBytes.Bytes()) sequence.SetFeatures(featBytes.Bytes())
}
annot := sequence.Annotations() annot := sequence.Annotations()
annot["scientific_name"] = scientificName annot["scientific_name"] = scientificName
@ -155,7 +161,7 @@ func _ParseGenbankFile(source string,
sequences = append(sequences, sequence) sequences = append(sequences, sequence)
sumlength += sequence.Len() sumlength += sequence.Len()
if len(sequences) == 100 || sumlength > 1e7 { if len(sequences) == batch_size || sumlength > total_seq_size {
log.Debugln("Pushing sequences") log.Debugln("Pushing sequences")
out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences)) out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
sequences = make(obiseq.BioSequenceSlice, 0, 100) sequences = make(obiseq.BioSequenceSlice, 0, 100)
@ -184,8 +190,10 @@ func _ParseGenbankFile(source string,
default: default:
switch state { switch state {
case inFeature: case inFeature:
if withFeatureTable {
featBytes.WriteByte('\n') featBytes.WriteByte('\n')
featBytes.WriteString(line) featBytes.WriteString(line)
}
if strings.HasPrefix(line, ` /db_xref="taxon:`) { if strings.HasPrefix(line, ` /db_xref="taxon:`) {
taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0]) taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
} }
@ -227,7 +235,8 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
// for j := 0; j < opt.ParallelWorkers(); j++ { // for j := 0; j < opt.ParallelWorkers(); j++ {
for j := 0; j < nworkers; j++ { for j := 0; j < nworkers; j++ {
go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order) go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order,
opt.WithFeatureTable(), opt.BatchSize(), opt.TotalSeqSize())
} }
go _ReadFlatFileChunk(reader, entry_channel) go _ReadFlatFileChunk(reader, entry_channel)

View File

@ -11,6 +11,7 @@ type __options__ struct {
with_progress_bar bool with_progress_bar bool
buffer_size int buffer_size int
batch_size int batch_size int
total_seq_size int
full_file_batch bool full_file_batch bool
parallel_workers int parallel_workers int
closefile bool closefile bool
@ -29,6 +30,7 @@ type __options__ struct {
csv_auto bool csv_auto bool
paired_filename string paired_filename string
source string source string
with_feature_table bool
} }
type Options struct { type Options struct {
@ -45,6 +47,7 @@ func MakeOptions(setters []WithOption) Options {
buffer_size: 2, buffer_size: 2,
parallel_workers: obioptions.CLIReadParallelWorkers(), parallel_workers: obioptions.CLIReadParallelWorkers(),
batch_size: obioptions.CLIBatchSize(), batch_size: obioptions.CLIBatchSize(),
total_seq_size: 1024 * 1024 * 100, // 100 MB by default
full_file_batch: false, full_file_batch: false,
closefile: false, closefile: false,
appendfile: false, appendfile: false,
@ -62,6 +65,7 @@ func MakeOptions(setters []WithOption) Options {
csv_auto: false, csv_auto: false,
paired_filename: "", paired_filename: "",
source: "", source: "",
with_feature_table: false,
} }
opt := Options{&o} opt := Options{&o}
@ -77,6 +81,10 @@ func (opt Options) BatchSize() int {
return opt.pointer.batch_size return opt.pointer.batch_size
} }
func (opt Options) TotalSeqSize() int {
return opt.pointer.total_seq_size
}
func (opt Options) FullFileBatch() bool { func (opt Options) FullFileBatch() bool {
return opt.pointer.full_file_batch return opt.pointer.full_file_batch
} }
@ -169,6 +177,10 @@ func (opt Options) Source() string {
return opt.pointer.source return opt.pointer.source
} }
func (opt Options) WithFeatureTable() bool {
return opt.pointer.with_feature_table
}
func OptionCloseFile() WithOption { func OptionCloseFile() WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
opt.pointer.closefile = true opt.pointer.closefile = true
@ -259,6 +271,14 @@ func OptionsBatchSize(size int) WithOption {
return f return f
} }
func OptionsBatchSizeDefault(bp int) WithOption {
f := WithOption(func(opt Options) {
opt.pointer.batch_size = bp
})
return f
}
func OptionsFullFileBatch(full bool) WithOption { func OptionsFullFileBatch(full bool) WithOption {
f := WithOption(func(opt Options) { f := WithOption(func(opt Options) {
opt.pointer.full_file_batch = full opt.pointer.full_file_batch = full
@ -386,3 +406,11 @@ func CSVAutoColumn(auto bool) WithOption {
return f return f
} }
func WithFeatureTable(with bool) WithOption {
f := WithOption(func(opt Options) {
opt.pointer.with_feature_table = with
})
return f
}