From 1542ce4c63021e61c28da139d50af54f14bf5def Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Tue, 20 Feb 2024 13:23:07 +0100
Subject: [PATCH] Work on EMBL and Genbank parser efficiency

Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07
---
 Release-notes.md               |  8 ++++++++
 pkg/obiformats/embl_read.go    | 26 ++++++++++++++++++--------
 pkg/obiformats/genbank_read.go | 21 +++++++++++++++------
 pkg/obiformats/options.go      | 28 ++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/Release-notes.md b/Release-notes.md
index 4b242a1..f6476c0 100644
--- a/Release-notes.md
+++ b/Release-notes.md
@@ -2,6 +2,14 @@
 
 ## Latest changes
 
+### Enhancement
+- The bug fix applied to the EMBL and Genbank parsers in version 4.1.2 of OBITools4 potentially
+  reduced parsing performance. This has now been fixed.
+- Along the same lines, the EMBL and Genbank parsers used to read and keep in memory not only the
+  sequence but also its annotations (the features table). None of the obitools currently use this
+  information, and for large complete genomes it occupies a lot of memory. To reduce this impact,
+  the new version of the parser no longer stores the annotations in memory by default.
+
 ## February 16th, 2024. Release 4.1.2
 
 ### Bug fixes
diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go
index 55e0054..75c6701 100644
--- a/pkg/obiformats/embl_read.go
+++ b/pkg/obiformats/embl_read.go
@@ -94,7 +94,11 @@ func _EndOfLastEntry(buff []byte) int {
     return -1
 }
 
-func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence) {
+func _ParseEmblFile(source string, input <-chan _FileChunk,
+    out obiiter.IBioSequence,
+    withFeatureTable bool,
+    batch_size int,
+    total_seq_size int) {
 
     for chunks := range input {
         scanner := bufio.NewScanner(chunks.raw)
@@ -120,14 +124,16 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
                 defBytes.WriteByte(' ')
             }
             defBytes.WriteString(strings.TrimSpace(line[5:]))
-        case strings.HasPrefix(line, "FH   "):
+        case withFeatureTable && strings.HasPrefix(line, "FH   "):
             featBytes.WriteString(line)
-        case line == "FH":
+        case withFeatureTable && line == "FH":
             featBytes.WriteByte('\n')
             featBytes.WriteString(line)
         case strings.HasPrefix(line, "FT   "):
-            featBytes.WriteByte('\n')
-            featBytes.WriteString(line)
+            if withFeatureTable {
+                featBytes.WriteByte('\n')
+                featBytes.WriteString(line)
+            }
             if strings.HasPrefix(line, `FT                   /db_xref="taxon:`) {
                 taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
             }
@@ -143,7 +149,9 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
                 defBytes.String())
 
             sequence.SetSource(source)
-            sequence.SetFeatures(featBytes.Bytes())
+            if withFeatureTable {
+                sequence.SetFeatures(featBytes.Bytes())
+            }
 
             annot := sequence.Annotations()
             annot["scientific_name"] = scientificName
@@ -198,7 +206,7 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
     for err == nil {
 
         // Create an extended buffer to read from if the end of the last entry is not found in the current buffer
-        extbuff := make([]byte, 1<<22)
+        extbuff := make([]byte, _FileChunkSize)
         end := 0
         ic := 0
 
@@ -278,7 +286,9 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
 
     // for j := 0; j < opt.ParallelWorkers(); j++ {
     for j := 0; j < nworkers; j++ {
-        go _ParseEmblFile(opt.Source(), entry_channel, newIter)
+        go _ParseEmblFile(opt.Source(), entry_channel, newIter,
+            opt.WithFeatureTable(),
+            opt.BatchSize(), opt.TotalSeqSize())
     }
 
     go _ReadFlatFileChunk(reader, entry_channel)
diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go
index 502d1a9..925d55e 100644
--- a/pkg/obiformats/genbank_read.go
+++ b/pkg/obiformats/genbank_read.go
@@ -30,7 +30,10 @@ var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
 
 func _ParseGenbankFile(source string,
     input <-chan _FileChunk, out obiiter.IBioSequence,
-    chunck_order func() int) {
+    chunck_order func() int,
+    withFeatureTable bool,
+    batch_size int,
+    total_seq_size int) {
 
     state := inHeader
     previous_chunk := -1
@@ -143,7 +146,10 @@ func _ParseGenbankFile(source string,
                 seqBytes.Bytes(),
                 defBytes.String())
             sequence.SetSource(source)
-            sequence.SetFeatures(featBytes.Bytes())
+
+            if withFeatureTable {
+                sequence.SetFeatures(featBytes.Bytes())
+            }
 
             annot := sequence.Annotations()
             annot["scientific_name"] = scientificName
@@ -155,7 +161,7 @@ func _ParseGenbankFile(source string,
             sequences = append(sequences, sequence)
             sumlength += sequence.Len()
 
-            if len(sequences) == 100 || sumlength > 1e7 {
+            if len(sequences) == batch_size || sumlength > total_seq_size {
                 log.Debugln("Pushing sequences")
                 out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
                 sequences = make(obiseq.BioSequenceSlice, 0, 100)
@@ -184,8 +190,10 @@ func _ParseGenbankFile(source string,
         default:
             switch state {
             case inFeature:
-                featBytes.WriteByte('\n')
-                featBytes.WriteString(line)
+                if withFeatureTable {
+                    featBytes.WriteByte('\n')
+                    featBytes.WriteString(line)
+                }
                 if strings.HasPrefix(line, `                     /db_xref="taxon:`) {
                     taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
                 }
@@ -227,7 +235,8 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
 
     // for j := 0; j < opt.ParallelWorkers(); j++ {
     for j := 0; j < nworkers; j++ {
-        go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order)
+        go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order,
+            opt.WithFeatureTable(), opt.BatchSize(), opt.TotalSeqSize())
     }
 
     go _ReadFlatFileChunk(reader, entry_channel)
diff --git a/pkg/obiformats/options.go b/pkg/obiformats/options.go
index bb7a72e..6666e3c 100644
--- a/pkg/obiformats/options.go
+++ b/pkg/obiformats/options.go
@@ -11,6 +11,7 @@ type __options__ struct {
     with_progress_bar bool
     buffer_size       int
     batch_size        int
+    total_seq_size    int
     full_file_batch   bool
     parallel_workers  int
     closefile         bool
@@ -29,6 +30,7 @@ type __options__ struct {
     csv_auto          bool
     paired_filename   string
     source            string
+    with_feature_table bool
 }
 
 type Options struct {
@@ -45,6 +47,7 @@ func MakeOptions(setters []WithOption) Options {
         buffer_size:       2,
         parallel_workers:  obioptions.CLIReadParallelWorkers(),
         batch_size:        obioptions.CLIBatchSize(),
+        total_seq_size:    1024 * 1024 * 100, // 100 MB by default
         full_file_batch:   false,
         closefile:         false,
         appendfile:        false,
@@ -62,6 +65,7 @@ func MakeOptions(setters []WithOption) Options {
         csv_auto:          false,
         paired_filename:   "",
         source:            "",
+        with_feature_table: false,
     }
 
     opt := Options{&o}
@@ -77,6 +81,10 @@ func (opt Options) BatchSize() int {
     return opt.pointer.batch_size
 }
 
+func (opt Options) TotalSeqSize() int {
+    return opt.pointer.total_seq_size
+}
+
 func (opt Options) FullFileBatch() bool {
     return opt.pointer.full_file_batch
 }
@@ -169,6 +177,10 @@ func (opt Options) Source() string {
     return opt.pointer.source
 }
 
+func (opt Options) WithFeatureTable() bool {
+    return opt.pointer.with_feature_table
+}
+
 func OptionCloseFile() WithOption {
     f := WithOption(func(opt Options) {
         opt.pointer.closefile = true
@@ -259,6 +271,14 @@ func OptionsBatchSize(size int) WithOption {
     return f
 }
 
+func OptionsBatchSizeDefault(bp int) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.batch_size = bp
+    })
+
+    return f
+}
+
 func OptionsFullFileBatch(full bool) WithOption {
     f := WithOption(func(opt Options) {
         opt.pointer.full_file_batch = full
@@ -386,3 +406,11 @@ func CSVAutoColumn(auto bool) WithOption {
 
     return f
 }
+
+func WithFeatureTable(with bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.with_feature_table = with
+    })
+
+    return f
+}
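A minimal usage sketch, not part of the patch itself, showing how caller code can opt back into keeping the features table now that the parsers drop it by default. `ReadEMBL`, `WithFeatureTable`, and `OptionsBatchSize` are taken from the diff above; the import path, the input file name, and the iterator-consumption calls (`Next`, `Get`, `Slice`) are assumptions about the surrounding OBITools4 code base, not something this patch defines.

```go
package main

import (
	"fmt"
	"log"
	"os"

	// Assumed import path for the obiformats package; adjust to the actual module path.
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
)

func main() {
	// Hypothetical EMBL flat file, used only for illustration.
	f, err := os.Open("sequences.embl")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// With this patch the parsers skip the features table by default.
	// WithFeatureTable(true) restores the previous behaviour, and
	// OptionsBatchSize tunes how many sequences are grouped per batch.
	iter := obiformats.ReadEMBL(f,
		obiformats.WithFeatureTable(true),
		obiformats.OptionsBatchSize(500),
	)

	// Consume the iterator batch by batch; Next/Get/Slice are assumed
	// from the obiiter API and are not defined in this patch.
	count := 0
	for iter.Next() {
		batch := iter.Get()
		count += len(batch.Slice())
	}
	fmt.Printf("parsed %d sequences\n", count)
}
```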