From 09fbc217d3bcbb7fd04432dfe9dd4cbf82984878 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Mar 2026 17:02:05 +0100 Subject: [PATCH] Add EMBL rope parsing support and improve sequence extraction Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries. --- pkg/obiformats/embl_read.go | 154 +++++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 2 deletions(-) diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index d2fa9b1..5a87eeb 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise return parser } +// extractEmblSeq scans the sequence section of an EMBL record directly on the +// rope. EMBL sequence lines start with 5 spaces followed by bases in groups of +// 10, separated by spaces, with a position number at the end. The section ends +// with "//". +func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte { + // We use ReadLine and scan each line for bases (skip digits, spaces, newlines). + for { + line := s.ReadLine() + if line == nil { + break + } + if len(line) >= 2 && line[0] == '/' && line[1] == '/' { + break + } + // Lines start with 5 spaces; bases follow separated by single spaces. + // Digits at the end are the position counter — skip them. + // Simplest: take every byte that is a letter. + for _, b := range line { + if b >= 'A' && b <= 'Z' { + b += 'a' - 'A' + } + if UtoT && b == 'u' { + b = 't' + } + if b >= 'a' && b <= 'z' { + dest = append(dest, b) + } + } + } + return dest +} + +// EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack(). +func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) { + scanner := newRopeScanner(rope) + sequences := obiseq.MakeBioSequenceSlice(100)[:0] + + var id string + var scientificName string + defBytes := make([]byte, 0, 256) + featBytes := make([]byte, 0, 1024) + var taxid int + inSeq := false + + for { + line := scanner.ReadLine() + if line == nil { + break + } + + if inSeq { + // Should not happen — extractEmblSeq consumed up to "//" + inSeq = false + continue + } + + switch { + case bytes.HasPrefix(line, []byte("ID ")): + id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0]) + case bytes.HasPrefix(line, []byte("OS ")): + scientificName = string(bytes.TrimSpace(line[5:])) + case bytes.HasPrefix(line, []byte("DE ")): + if len(defBytes) > 0 { + defBytes = append(defBytes, ' ') + } + defBytes = append(defBytes, bytes.TrimSpace(line[5:])...) + case withFeatureTable && bytes.HasPrefix(line, []byte("FH ")): + featBytes = append(featBytes, line...) + case withFeatureTable && bytes.Equal(line, []byte("FH")): + featBytes = append(featBytes, '\n') + featBytes = append(featBytes, line...) + case bytes.HasPrefix(line, []byte("FT ")): + if withFeatureTable { + featBytes = append(featBytes, '\n') + featBytes = append(featBytes, line...) + } + if bytes.HasPrefix(line, []byte(`FT /db_xref="taxon:`)) { + rest := line[37:] + end := bytes.IndexByte(rest, '"') + if end > 0 { + taxid, _ = strconv.Atoi(string(rest[:end])) + } + } + case bytes.HasPrefix(line, []byte(" ")): + // First sequence line: extract all bases via extractEmblSeq, + // which also consumes this line's remaining content. + // But ReadLine already consumed this line — we need to process it + // plus subsequent lines. Process this line inline then call helper. + seqDest := make([]byte, 0, 4096) + for _, b := range line { + if b >= 'A' && b <= 'Z' { + b += 'a' - 'A' + } + if UtoT && b == 'u' { + b = 't' + } + if b >= 'a' && b <= 'z' { + seqDest = append(seqDest, b) + } + } + seqDest = scanner.extractEmblSeq(seqDest, UtoT) + + seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes)) + seq.SetSource(source) + if withFeatureTable { + seq.SetFeatures(featBytes) + } + annot := seq.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, seq) + + // Reset state + id = "" + scientificName = "" + defBytes = defBytes[:0] + featBytes = featBytes[:0] + taxid = 1 + + case bytes.Equal(line, []byte("//")): + // record ended without SQ/sequence section (e.g. WGS entries) + if id != "" { + seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes)) + seq.SetSource(source) + if withFeatureTable { + seq.SetFeatures(featBytes) + } + annot := seq.Annotations() + annot["scientific_name"] = scientificName + annot["taxid"] = taxid + sequences = append(sequences, seq) + } + id = "" + scientificName = "" + defBytes = defBytes[:0] + featBytes = featBytes[:0] + taxid = 1 + } + } + + return sequences, nil +} + func _ParseEmblFile( input ChannelFileChunk, out obiiter.IBioSequence, @@ -171,7 +314,14 @@ func _ParseEmblFile( for chunks := range input { order := chunks.Order - sequences, err := parser(chunks.Source, chunks.Raw) + var sequences obiseq.BioSequenceSlice + var err error + + if chunks.Rope != nil { + sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT) + } else { + sequences, err = parser(chunks.Source, chunks.Raw) + } if err != nil { log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err) @@ -196,7 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er 1024*1024*128, EndOfLastFlatFileEntry, "\nID ", - true, + false, ) newIter := obiiter.MakeIBioSequence()