mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor rope scanner and add FASTQ rope parser
This commit refactors the rope scanner implementation by renaming gbRopeScanner to ropeScanner and extracting the common functionality into a new file. It also introduces a new FastqChunkParserRope function that parses FASTQ chunks directly from a rope without Pack(), enabling more efficient memory usage. The existing parsers are updated to use the new rope-based parser when available. The BioSequence type is enhanced with a TakeQualities method for more efficient quality data handling.
This commit is contained in:
@@ -29,70 +29,11 @@ const (
|
||||
|
||||
var _seqlenght_rx = regexp.MustCompile(" +([0-9]+) bp")
|
||||
|
||||
// gbRopeScanner reads lines from a PieceOfChunk rope without heap allocation.
|
||||
// The carry buffer (stack) handles lines that span two rope nodes.
|
||||
type gbRopeScanner struct {
|
||||
current *PieceOfChunk
|
||||
pos int
|
||||
carry [256]byte // max GenBank line = 80 chars; 256 gives ample margin
|
||||
carryN int
|
||||
}
|
||||
|
||||
func newGbRopeScanner(rope *PieceOfChunk) *gbRopeScanner {
|
||||
return &gbRopeScanner{current: rope}
|
||||
}
|
||||
|
||||
// ReadLine returns the next line without the trailing \n (or \r\n).
|
||||
// Returns nil at end of rope. The returned slice aliases carry[] or the node
|
||||
// data and is valid only until the next ReadLine call.
|
||||
func (s *gbRopeScanner) ReadLine() []byte {
|
||||
for {
|
||||
if s.current == nil {
|
||||
if s.carryN > 0 {
|
||||
n := s.carryN
|
||||
s.carryN = 0
|
||||
return s.carry[:n]
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
|
||||
if idx >= 0 {
|
||||
var line []byte
|
||||
if s.carryN == 0 {
|
||||
line = data[:idx]
|
||||
} else {
|
||||
n := copy(s.carry[s.carryN:], data[:idx])
|
||||
s.carryN += n
|
||||
line = s.carry[:s.carryN]
|
||||
s.carryN = 0
|
||||
}
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
if len(line) > 0 && line[len(line)-1] == '\r' {
|
||||
line = line[:len(line)-1]
|
||||
}
|
||||
return line
|
||||
}
|
||||
|
||||
// No \n in this node: accumulate into carry and advance
|
||||
n := copy(s.carry[s.carryN:], data)
|
||||
s.carryN += n
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
|
||||
// extractSequence scans the ORIGIN section byte-by-byte directly on the rope,
|
||||
// appending compacted bases to dest. Returns the extended slice.
|
||||
// Stops and returns when "//" is found at the start of a line.
|
||||
// The scanner is left positioned after the "//" line.
|
||||
func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||
func (s *ropeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||
lineStart := true
|
||||
skipDigits := true
|
||||
|
||||
@@ -139,24 +80,6 @@ func (s *gbRopeScanner) extractSequence(dest []byte, UtoT bool) []byte {
|
||||
return dest
|
||||
}
|
||||
|
||||
// skipToNewline advances the scanner past the next '\n'.
|
||||
func (s *gbRopeScanner) skipToNewline() {
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
idx := bytes.IndexByte(data, '\n')
|
||||
if idx >= 0 {
|
||||
s.pos += idx + 1
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
}
|
||||
|
||||
// parseLseqFromLocus extracts the declared sequence length from a LOCUS line.
|
||||
// Format: "LOCUS <id> <length> bp ..."
|
||||
// Returns -1 if not found or parse error.
|
||||
@@ -205,7 +128,7 @@ func GenbankChunkParserRope(source string, rope *PieceOfChunk,
|
||||
withFeatureTable, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
state := inHeader
|
||||
scanner := newGbRopeScanner(rope)
|
||||
scanner := newRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
id := ""
|
||||
|
||||
Reference in New Issue
Block a user