From 1542ce4c63021e61c28da139d50af54f14bf5def Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Tue, 20 Feb 2024 13:23:07 +0100
Subject: [PATCH] Work on EMBL and Genbank parser efficiency

Former-commit-id: 309cc9ce4eea4c8085d7d4451a66a81710532f07
---
 Release-notes.md               |  8 ++++++++
 pkg/obiformats/embl_read.go    | 26 ++++++++++++++++++--------
 pkg/obiformats/genbank_read.go | 21 +++++++++++++++------
 pkg/obiformats/options.go      | 28 ++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/Release-notes.md b/Release-notes.md
index 4b242a1..f6476c0 100644
--- a/Release-notes.md
+++ b/Release-notes.md
@@ -2,6 +2,14 @@
 
 ## Latest changes
 
+### Enhancement
+- The bug fix applied to the EMBL and Genbank parsers in version 4.1.2 of OBITools4 potentially
+  reduced parsing performance. This has now been fixed.
+- Along the same lines, the EMBL and Genbank parsers used to read and keep in memory not only the
+  sequence but also its annotations (the features table). None of the obitools currently use this
+  information, and for large complete genomes it occupies a lot of memory. To reduce this impact,
+  the new version of the parser no longer stores the annotations in memory by default.
+
 ## February 16th, 2024. Release 4.1.2
 
 ### Bug fixes
diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go
index 55e0054..75c6701 100644
--- a/pkg/obiformats/embl_read.go
+++ b/pkg/obiformats/embl_read.go
@@ -94,7 +94,11 @@ func _EndOfLastEntry(buff []byte) int {
     return -1
 }
 
-func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequence) {
+func _ParseEmblFile(source string, input <-chan _FileChunk,
+    out obiiter.IBioSequence,
+    withFeatureTable bool,
+    batch_size int,
+    total_seq_size int) {
 
     for chunks := range input {
         scanner := bufio.NewScanner(chunks.raw)
@@ -120,14 +124,16 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
                 defBytes.WriteByte(' ')
             }
             defBytes.WriteString(strings.TrimSpace(line[5:]))
-        case strings.HasPrefix(line, "FH   "):
+        case withFeatureTable && strings.HasPrefix(line, "FH   "):
             featBytes.WriteString(line)
-        case line == "FH":
+        case withFeatureTable && line == "FH":
             featBytes.WriteByte('\n')
             featBytes.WriteString(line)
         case strings.HasPrefix(line, "FT   "):
-            featBytes.WriteByte('\n')
-            featBytes.WriteString(line)
+            if withFeatureTable {
+                featBytes.WriteByte('\n')
+                featBytes.WriteString(line)
+            }
             if strings.HasPrefix(line, `FT                   /db_xref="taxon:`) {
                 taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
             }
@@ -143,7 +149,9 @@ func _ParseEmblFile(source string, input <-chan _FileChunk, out obiiter.IBioSequ
                 defBytes.String())
 
             sequence.SetSource(source)
-            sequence.SetFeatures(featBytes.Bytes())
+            if withFeatureTable {
+                sequence.SetFeatures(featBytes.Bytes())
+            }
 
             annot := sequence.Annotations()
             annot["scientific_name"] = scientificName
@@ -198,7 +206,7 @@ func _ReadFlatFileChunk(reader io.Reader, readers chan _FileChunk) {
     for err == nil {
 
         // Create an extended buffer to read from if the end of the last entry is not found in the current buffer
-        extbuff := make([]byte, 1<<22)
+        extbuff := make([]byte, _FileChunkSize)
         end := 0
         ic := 0
 
@@ -278,7 +286,9 @@ func ReadEMBL(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
 
     // for j := 0; j < opt.ParallelWorkers(); j++ {
     for j := 0; j < nworkers; j++ {
-        go _ParseEmblFile(opt.Source(), entry_channel, newIter)
+        go _ParseEmblFile(opt.Source(), entry_channel, newIter,
+            opt.WithFeatureTable(),
+            opt.BatchSize(), opt.TotalSeqSize())
     }
 
     go _ReadFlatFileChunk(reader, entry_channel)
diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go
index 502d1a9..925d55e 100644
--- a/pkg/obiformats/genbank_read.go
+++ b/pkg/obiformats/genbank_read.go
@@ -30,7 +30,10 @@ var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
 
 func _ParseGenbankFile(source string,
     input <-chan _FileChunk, out obiiter.IBioSequence,
-    chunck_order func() int) {
+    chunck_order func() int,
+    withFeatureTable bool,
+    batch_size int,
+    total_seq_size int) {
 
     state := inHeader
     previous_chunk := -1
@@ -143,7 +146,10 @@ func _ParseGenbankFile(source string,
                 seqBytes.Bytes(),
                 defBytes.String())
             sequence.SetSource(source)
-            sequence.SetFeatures(featBytes.Bytes())
+
+            if withFeatureTable {
+                sequence.SetFeatures(featBytes.Bytes())
+            }
 
             annot := sequence.Annotations()
             annot["scientific_name"] = scientificName
@@ -155,7 +161,7 @@ func _ParseGenbankFile(source string,
             sequences = append(sequences, sequence)
             sumlength += sequence.Len()
 
-            if len(sequences) == 100 || sumlength > 1e7 {
+            if len(sequences) == batch_size || sumlength > total_seq_size {
                 log.Debugln("Pushing sequences")
                 out.Push(obiiter.MakeBioSequenceBatch(chunck_order(), sequences))
                 sequences = make(obiseq.BioSequenceSlice, 0, 100)
@@ -184,8 +190,10 @@ func _ParseGenbankFile(source string,
         default:
             switch state {
             case inFeature:
-                featBytes.WriteByte('\n')
-                featBytes.WriteString(line)
+                if withFeatureTable {
+                    featBytes.WriteByte('\n')
+                    featBytes.WriteString(line)
+                }
                 if strings.HasPrefix(line, `                     /db_xref="taxon:`) {
                     taxid, _ = strconv.Atoi(strings.SplitN(line[37:], `"`, 2)[0])
                 }
@@ -227,7 +235,8 @@ func ReadGenbank(reader io.Reader, options ...WithOption) obiiter.IBioSequence {
 
     // for j := 0; j < opt.ParallelWorkers(); j++ {
     for j := 0; j < nworkers; j++ {
-        go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order)
+        go _ParseGenbankFile(opt.Source(), entry_channel, newIter, chunck_order,
+            opt.WithFeatureTable(), opt.BatchSize(), opt.TotalSeqSize())
     }
 
     go _ReadFlatFileChunk(reader, entry_channel)
diff --git a/pkg/obiformats/options.go b/pkg/obiformats/options.go
index bb7a72e..6666e3c 100644
--- a/pkg/obiformats/options.go
+++ b/pkg/obiformats/options.go
@@ -11,6 +11,7 @@ type __options__ struct {
     with_progress_bar bool
     buffer_size       int
     batch_size        int
+    total_seq_size    int
     full_file_batch   bool
     parallel_workers  int
     closefile         bool
@@ -29,6 +30,7 @@ type __options__ struct {
     csv_auto          bool
     paired_filename   string
     source            string
+    with_feature_table bool
 }
 
 type Options struct {
@@ -45,6 +47,7 @@ func MakeOptions(setters []WithOption) Options {
         buffer_size:       2,
         parallel_workers:  obioptions.CLIReadParallelWorkers(),
         batch_size:        obioptions.CLIBatchSize(),
+        total_seq_size:    1024 * 1024 * 100, // 100 MB by default
         full_file_batch:   false,
         closefile:         false,
         appendfile:        false,
@@ -62,6 +65,7 @@ func MakeOptions(setters []WithOption) Options {
         csv_auto:          false,
         paired_filename:   "",
         source:            "",
+        with_feature_table: false,
     }
 
     opt := Options{&o}
@@ -77,6 +81,10 @@ func (opt Options) BatchSize() int {
     return opt.pointer.batch_size
 }
 
+func (opt Options) TotalSeqSize() int {
+    return opt.pointer.total_seq_size
+}
+
 func (opt Options) FullFileBatch() bool {
     return opt.pointer.full_file_batch
 }
@@ -169,6 +177,10 @@ func (opt Options) Source() string {
     return opt.pointer.source
 }
 
+func (opt Options) WithFeatureTable() bool {
+    return opt.pointer.with_feature_table
+}
+
 func OptionCloseFile() WithOption {
     f := WithOption(func(opt Options) {
         opt.pointer.closefile = true
@@ -259,6 +271,14 @@ func OptionsBatchSize(size int) WithOption {
     return f
 }
 
+func OptionsBatchSizeDefault(bp int) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.batch_size = bp
+    })
+
+    return f
+}
+
 func OptionsFullFileBatch(full bool) WithOption {
     f := WithOption(func(opt Options) {
         opt.pointer.full_file_batch = full
@@ -386,3 +406,11 @@ func CSVAutoColumn(auto bool) WithOption {
 
     return f
 }
+
+func WithFeatureTable(with bool) WithOption {
+    f := WithOption(func(opt Options) {
+        opt.pointer.with_feature_table = with
+    })
+
+    return f
+}
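A minimal usage sketch, not part of the patch itself, showing how caller code can opt back into keeping the features table now that the parsers drop it by default. `ReadEMBL`, `WithFeatureTable`, and `OptionsBatchSize` are taken from the diff above; the import path, the input file name, and the iterator-consumption calls (`Next`, `Get`, `Slice`) are assumptions about the surrounding OBITools4 code base, not something this patch defines.

```go
package main

import (
	"fmt"
	"log"
	"os"

	// Assumed import path for the obiformats package; adjust to the actual module path.
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
)

func main() {
	// Hypothetical EMBL flat file, used only for illustration.
	f, err := os.Open("sequences.embl")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// With this patch the parsers skip the features table by default.
	// WithFeatureTable(true) restores the previous behaviour, and
	// OptionsBatchSize tunes how many sequences are grouped per batch.
	iter := obiformats.ReadEMBL(f,
		obiformats.WithFeatureTable(true),
		obiformats.OptionsBatchSize(500),
	)

	// Consume the iterator batch by batch; Next/Get/Slice are assumed
	// from the obiiter API and are not defined in this patch.
	count := 0
	for iter.Next() {
		batch := iter.Get()
		count += len(batch.Slice())
	}
	fmt.Printf("parsed %d sequences\n", count)
}
```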