Refactor rope scanner and add FASTQ rope parser

This commit refactors the rope scanner implementation by renaming gbRopeScanner to ropeScanner and extracting the common functionality into a new file. It also introduces a new FastqChunkParserRope function that parses FASTQ chunks directly from a rope without Pack(), enabling more efficient memory usage. The existing parsers are updated to use the new rope-based parser when available. The BioSequence type is enhanced with a TakeQualities method for more efficient quality data handling.
This commit is contained in:
Eric Coissac
2026-03-10 16:46:53 +01:00
parent 623116ab13
commit 3d2e205722
5 changed files with 176 additions and 83 deletions

View File

@@ -29,70 +29,11 @@ const (
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
// gbRopeScanner is a line-oriented cursor over a PieceOfChunk rope.
//
// Lines that lie entirely inside one node are handed out as sub-slices of
// that node's data. Lines that straddle a node boundary are reassembled in
// the fixed-size carry array embedded in the scanner.
type gbRopeScanner struct {
	// current is the rope node being read; nil once the rope is exhausted.
	current *PieceOfChunk
	// pos is the read offset inside current.data.
	pos int
	// carry stores the beginning of a line whose end is in a later node.
	// GenBank lines are at most 80 characters, so 256 bytes leaves margin.
	carry [256]byte
	// carryN is the number of valid bytes currently held in carry.
	carryN int
}
// newGbRopeScanner builds a scanner positioned at the first byte of rope.
// Passing a nil rope yields a scanner that is already at end of input.
func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner {
	scanner := new(gbRopeScanner)
	scanner.current = rope
	return scanner
}
// ReadLine returns the next line without the trailing \n (or \r\n).
// Returns nil at end of rope; an empty line is returned as a non-nil,
// zero-length slice. A final line that is not terminated by \n is returned
// as is.
//
// The returned slice aliases the node data, the scanner's carry array, or,
// for a line spanning several nodes that does not fit in the carry array,
// a freshly allocated buffer. It is valid only until the next ReadLine call.
func (s *gbRopeScanner) ReadLine() []byte {
	// overflow takes over from s.carry when a multi-node line is longer than
	// the fixed array. copy() would otherwise silently drop the excess bytes
	// and return a truncated line. Invariant: overflow != nil => carryN == 0.
	var overflow []byte

	for s.current != nil {
		data := s.current.data[s.pos:]
		idx := bytes.IndexByte(data, '\n')

		// frag is the part of this node that belongs to the current line.
		frag := data
		if idx >= 0 {
			frag = data[:idx]
		}

		// Accumulate unless the whole line lies inside this node, in which
		// case it is returned below without any copy.
		if idx < 0 || s.carryN > 0 || overflow != nil {
			switch {
			case overflow != nil:
				overflow = append(overflow, frag...)
			case s.carryN+len(frag) <= len(s.carry):
				s.carryN += copy(s.carry[s.carryN:], frag)
			default:
				// The line no longer fits: move what was carried so far,
				// plus this fragment, to a heap buffer.
				overflow = make([]byte, 0, s.carryN+len(frag))
				overflow = append(overflow, s.carry[:s.carryN]...)
				overflow = append(overflow, frag...)
				s.carryN = 0
			}
		}

		if idx < 0 {
			// No \n in this node: keep looking in the next one.
			s.current = s.current.Next()
			s.pos = 0
			continue
		}

		// The line is complete: step past the \n.
		s.pos += idx + 1
		if s.pos >= len(s.current.data) {
			s.current = s.current.Next()
			s.pos = 0
		}

		line := frag
		switch {
		case overflow != nil:
			line = overflow
		case s.carryN > 0:
			line = s.carry[:s.carryN]
			s.carryN = 0
		}
		if len(line) > 0 && line[len(line)-1] == '\r' {
			line = line[:len(line)-1]
		}
		return line
	}

	// End of rope: flush a last line that had no terminating \n.
	if overflow != nil {
		return overflow
	}
	if s.carryN > 0 {
		n := s.carryN
		s.carryN = 0
		return s.carry[:n]
	}
	return nil
}
// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
// appending compacted bases to dest. Returns the extended slice.
// Stops and returns when "//" is found at the start of a line.
// The scanner is left positioned after the "//" line.
func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
lineStart := true
skipDigits := true
@@ -139,24 +80,6 @@ func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
return dest
}
// skipToNewline moves the read position to just after the next '\n',
// crossing rope nodes as needed. If no '\n' remains, the scanner ends up at
// end of rope (current == nil).
func (s *gbRopeScanner) skipToNewline() {
	for {
		node := s.current
		if node == nil {
			return
		}

		nl := bytes.IndexByte(node.data[s.pos:], '\n')
		if nl < 0 {
			// Nothing left to match in this node: try the next one.
			s.current, s.pos = node.Next(), 0
			continue
		}

		s.pos += nl + 1
		if s.pos >= len(node.data) {
			// The '\n' was the last byte of the node.
			s.current, s.pos = node.Next(), 0
		}
		return
	}
}
// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
// Format: "LOCUS <id> <length> bp ..."
// Returns -1 if not found or parse error.
@@ -205,7 +128,7 @@ func GenbankChunkParserRope(source string, rope *PieceOfChunk,
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
state := inHeader
scanner := newGbRopeScanner(rope)
scanner := newRopeScanner(rope)
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
id := ""