Add EMBL rope parsing support and improve sequence extraction

Introduce EmblChunkParserRope function to parse EMBL chunks directly from a rope without using Pack(). Add extractEmblSeq helper to scan sequence sections and handle U to T conversion. Update parser logic to use rope-based parsing when available, and fix feature table handling for WGS entries.
2026-06-24 01:31:00 +00:00 · 2026-03-10 17:02:05 +01:00
parent 3d2e205722
commit 09fbc217d3
1 changed files with 152 additions and 2 deletions
@@ -161,6 +161,149 @@ func EmblChunkParser(withFeatureTable, UtoT bool) func(string, io.Reader) (obise
 	return parser
 }
 // extractEmblSeq scans the sequence section of an EMBL record directly on the
 // rope. EMBL sequence lines start with 5 spaces followed by bases in groups of
 // 10, separated by spaces, with a position number at the end. The section ends
 // with "//".
 func (s *ropeScanner) extractEmblSeq(dest []byte, UtoT bool) []byte {
 	// We use ReadLine and scan each line for bases (skip digits, spaces, newlines).
 	for {
 		line := s.ReadLine()
 		if line == nil {
 			break
 		}
 		if len(line) >= 2 && line[0] == '/' && line[1] == '/' {
 			break
 		}
 		// Lines start with 5 spaces; bases follow separated by single spaces.
 		// Digits at the end are the position counter — skip them.
 		// Simplest: take every byte that is a letter.
 		for _, b := range line {
 			if b >= 'A' && b <= 'Z' {
 				b += 'a' - 'A'
 			}
 			if UtoT && b == 'u' {
 				b = 't'
 			}
 			if b >= 'a' && b <= 'z' {
 				dest = append(dest, b)
 			}
 		}
 	}
 	return dest
 }
 // EmblChunkParserRope parses an EMBL chunk directly from a rope without Pack().
 func EmblChunkParserRope(source string, rope *PieceOfChunk, withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
 	scanner := newRopeScanner(rope)
 	sequences := obiseq.MakeBioSequenceSlice(100)[:0]
 	var id string
 	var scientificName string
 	defBytes := make([]byte, 0, 256)
 	featBytes := make([]byte, 0, 1024)
 	var taxid int
 	inSeq := false
 	for {
 		line := scanner.ReadLine()
 		if line == nil {
 			break
 		}
 		if inSeq {
 			// Should not happen — extractEmblSeq consumed up to "//"
 			inSeq = false
 			continue
 		}
 		switch {
 		case bytes.HasPrefix(line, []byte("ID   ")):
 			id = string(bytes.SplitN(line[5:], []byte(";"), 2)[0])
 		case bytes.HasPrefix(line, []byte("OS   ")):
 			scientificName = string(bytes.TrimSpace(line[5:]))
 		case bytes.HasPrefix(line, []byte("DE   ")):
 			if len(defBytes) > 0 {
 				defBytes = append(defBytes, ' ')
 			}
 			defBytes = append(defBytes, bytes.TrimSpace(line[5:])...)
 		case withFeatureTable && bytes.HasPrefix(line, []byte("FH   ")):
 			featBytes = append(featBytes, line...)
 		case withFeatureTable && bytes.Equal(line, []byte("FH")):
 			featBytes = append(featBytes, '\n')
 			featBytes = append(featBytes, line...)
 		case bytes.HasPrefix(line, []byte("FT   ")):
 			if withFeatureTable {
 				featBytes = append(featBytes, '\n')
 				featBytes = append(featBytes, line...)
 			}
 			if bytes.HasPrefix(line, []byte(`FT                   /db_xref="taxon:`)) {
 				rest := line[37:]
 				end := bytes.IndexByte(rest, '"')
 				if end > 0 {
 					taxid, _ = strconv.Atoi(string(rest[:end]))
 				}
 			}
 		case bytes.HasPrefix(line, []byte("     ")):
 			// First sequence line: extract all bases via extractEmblSeq,
 			// which also consumes this line's remaining content.
 			// But ReadLine already consumed this line — we need to process it
 			// plus subsequent lines. Process this line inline then call helper.
 			seqDest := make([]byte, 0, 4096)
 			for _, b := range line {
 				if b >= 'A' && b <= 'Z' {
 					b += 'a' - 'A'
 				}
 				if UtoT && b == 'u' {
 					b = 't'
 				}
 				if b >= 'a' && b <= 'z' {
 					seqDest = append(seqDest, b)
 				}
 			}
 			seqDest = scanner.extractEmblSeq(seqDest, UtoT)
 			seq := obiseq.NewBioSequenceOwning(id, seqDest, string(defBytes))
 			seq.SetSource(source)
 			if withFeatureTable {
 				seq.SetFeatures(featBytes)
 			}
 			annot := seq.Annotations()
 			annot["scientific_name"] = scientificName
 			annot["taxid"] = taxid
 			sequences = append(sequences, seq)
 			// Reset state
 			id = ""
 			scientificName = ""
 			defBytes = defBytes[:0]
 			featBytes = featBytes[:0]
 			taxid = 1
 		case bytes.Equal(line, []byte("//")):
 			// record ended without SQ/sequence section (e.g. WGS entries)
 			if id != "" {
 				seq := obiseq.NewBioSequenceOwning(id, []byte{}, string(defBytes))
 				seq.SetSource(source)
 				if withFeatureTable {
 					seq.SetFeatures(featBytes)
 				}
 				annot := seq.Annotations()
 				annot["scientific_name"] = scientificName
 				annot["taxid"] = taxid
 				sequences = append(sequences, seq)
 			}
 			id = ""
 			scientificName = ""
 			defBytes = defBytes[:0]
 			featBytes = featBytes[:0]
 			taxid = 1
 		}
 	}
 	return sequences, nil
 }
 func _ParseEmblFile(
 	input ChannelFileChunk,
 	out obiiter.IBioSequence,
@@ -171,7 +314,14 @@ func _ParseEmblFile(
 	for chunks := range input {
 		order := chunks.Order
-		sequences, err := parser(chunks.Source, chunks.Raw)
+		var sequences obiseq.BioSequenceSlice
 		var err error
 		if chunks.Rope != nil {
 			sequences, err = EmblChunkParserRope(chunks.Source, chunks.Rope, withFeatureTable, UtoT)
 		} else {
 			sequences, err = parser(chunks.Source, chunks.Raw)
 		}
 		if err != nil {
 			log.Fatalf("%s : Cannot parse the embl file : %v", chunks.Source, err)
@@ -196,7 +346,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
 		1024*1024*128,
 		EndOfLastFlatFileEntry,
 		"\nID   ",
-		true,
+		false,
 	)
 	newIter := obiiter.MakeIBioSequence()