From 09fbc217d3bcbb7fd04432dfe9dd4cbf82984878 Mon Sep 17 00:00:00 2001
From: Eric Coissac <eric@coissac.eu>
Date: Tue, 10 Mar 2026 17:02:05 +0100
Subject: [PATCH] Add EMBL rope parsing support and improve sequence extraction

Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries.
---
 pkg/obiformats/embl_read.go | 154 +++++++++++++++++++++++++++++++++++-
 1 file changed, 152 insertions(+), 2 deletions(-)

diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go
index d2fa9b1..5a87eeb 100644
--- a/pkg/obiformats/embl_read.go
+++ b/pkg/obiformats/embl_read.go
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
 	return parser
 }
 
+// extractEmblSeq scans the sequence section of an EMBL record directly on the
+// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of
+// 10, separated by spaces, with a position number at the end. The section ends
+// with "//".
+func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
+	// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).
+	for {
+		line := s.ReadLine()
+		if line == nil {
+			break
+		}
+		if len(line) >= 2 && line[0] == '/' && line[1] == '/' {
+			break
+		}
+		// Lines start with 5 spaces; bases follow separated by single spaces.
+		// Digits at the end are the position counter — skip them.
+		// Simplest: take every byte that is a letter.
+		for _, b := range line {
+			if b >= 'A' && b <= 'Z' {
+				b += 'a' - 'A'
+			}
+			if UtoT && b == 'u' {
+				b = 't'
+			}
+			if b >= 'a' && b <= 'z' {
+				dest = append(dest, b)
+			}
+		}
+	}
+	return dest
+}
+
+// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
+func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
+	scanner := newRopeScanner(rope)
+	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
+
+	var id string
+	var scientificName string
+	defBytes := make([]byte, 0, 256)
+	featBytes := make([]byte, 0, 1024)
+	var taxid int
+	inSeq := false
+
+	for {
+		line := scanner.ReadLine()
+		if line == nil {
+			break
+		}
+
+		if inSeq {
+			// Should not happen — extractEmblSeq consumed up to "//"
+			inSeq = false
+			continue
+		}
+
+		switch {
+		case bytes.HasPrefix(line, []byte("ID   ")):
+			id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
+		case bytes.HasPrefix(line, []byte("OS   ")):
+			scientificName = string(bytes.TrimSpace(line[5:]))
+		case bytes.HasPrefix(line, []byte("DE   ")):
+			if len(defBytes) > 0 {
+				defBytes = append(defBytes, ' ')
+			}
+			defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
+		case withFeatureTable && bytes.HasPrefix(line, []byte("FH   ")):
+			featBytes = append(featBytes, line...)
+		case withFeatureTable && bytes.Equal(line, []byte("FH")):
+			featBytes = append(featBytes, '\n')
+			featBytes = append(featBytes, line...)
+		case bytes.HasPrefix(line, []byte("FT   ")):
+			if withFeatureTable {
+				featBytes = append(featBytes, '\n')
+				featBytes = append(featBytes, line...)
+			}
+			if bytes.HasPrefix(line, []byte(`FT                   /db_xref="taxon:`)) {
+				rest := line[37:]
+				end := bytes.IndexByte(rest, '"')
+				if end > 0 {
+					taxid, _ = strconv.Atoi(string(rest[:end]))
+				}
+			}
+		case bytes.HasPrefix(line, []byte("     ")):
+			// First sequence line: extract all bases via extractEmblSeq,
+			// which also consumes this line's remaining content.
+			// But ReadLine already consumed this line — we need to process it
+			// plus subsequent lines. Process this line inline then call helper.
+			seqDest := make([]byte, 0, 4096)
+			for _, b := range line {
+				if b >= 'A' && b <= 'Z' {
+					b += 'a' - 'A'
+				}
+				if UtoT && b == 'u' {
+					b = 't'
+				}
+				if b >= 'a' && b <= 'z' {
+					seqDest = append(seqDest, b)
+				}
+			}
+			seqDest = scanner.extractEmblSeq(seqDest, UtoT)
+
+			seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))
+			seq.SetSource(source)
+			if withFeatureTable {
+				seq.SetFeatures(featBytes)
+			}
+			annot := seq.Annotations()
+			annot["scientific_name"] = scientificName
+			annot["taxid"] = taxid
+			sequences = append(sequences, seq)
+
+			// Reset state
+			id = ""
+			scientificName = ""
+			defBytes = defBytes[:0]
+			featBytes = featBytes[:0]
+			taxid = 1
+
+		case bytes.Equal(line, []byte("//")):
+			// record ended without SQ/sequence section (e.g. WGS entries)
+			if id != "" {
+				seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))
+				seq.SetSource(source)
+				if withFeatureTable {
+					seq.SetFeatures(featBytes)
+				}
+				annot := seq.Annotations()
+				annot["scientific_name"] = scientificName
+				annot["taxid"] = taxid
+				sequences = append(sequences, seq)
+			}
+			id = ""
+			scientificName = ""
+			defBytes = defBytes[:0]
+			featBytes = featBytes[:0]
+			taxid = 1
+		}
+	}
+
+	return sequences, nil
+}
+
 func _ParseEmblFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
@@ -171,7 +314,14 @@ func _ParseEmblFile(
 
 	for chunks := range input {
 		order := chunks.Order
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
+		var err error
+
+		if chunks.Rope != nil {
+			sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
+		} else {
+			sequences, err = parser(chunks.Source, chunks.Raw)
+		}
 
 		if err != nil {
 			log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
@@ -196,7 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nID   ",
-		true,
+		false,
 	)
 
 	newIter := obiiter.MakeIBioSequence()