mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Add rope-based FASTA parsing and improve sequence handling
Introduce FastaChunkParserRope for direct rope-based FASTA parsing, enhance sequence extraction with whitespace skipping and U->T conversion, and update parser logic to support both rope and raw data sources. - Added extractFastaSeq function to scan sequence bytes directly from rope - Implemented FastaChunkParserRope for rope-based parsing - Modified _ParseFastaFile to use rope when available - Updated sequence handling to support U->T conversion - Fixed line ending detection for FASTA parsing
This commit is contained in:
@@ -209,28 +209,121 @@ func FastaChunkParser(UtoT bool) func(string, io.Reader) (obiseq.BioSequenceSlic
|
||||
return parser
|
||||
}
|
||||
|
||||
// extractFastaSeq scans sequence bytes from the rope directly into dest,
|
||||
// appending valid nucleotide characters and skipping whitespace.
|
||||
// Stops when '>' is found at the start of a line (next record) or at EOF.
|
||||
// Returns (dest with appended bases, hasMore).
|
||||
// hasMore=true means scanner is now positioned at '>' of the next record.
|
||||
func (s *gbRopeScanner) extractFastaSeq(dest []byte, UtoT bool) ([]byte, bool) {
|
||||
lineStart := true
|
||||
|
||||
for s.current != nil {
|
||||
data := s.current.data[s.pos:]
|
||||
for i, b := range data {
|
||||
if lineStart && b == '>' {
|
||||
s.pos += i
|
||||
if s.pos >= len(s.current.data) {
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return dest, true
|
||||
}
|
||||
if b == '\n' || b == '\r' {
|
||||
lineStart = true
|
||||
continue
|
||||
}
|
||||
lineStart = false
|
||||
if b == ' ' || b == '\t' {
|
||||
continue
|
||||
}
|
||||
if b >= 'A' && b <= 'Z' {
|
||||
b += 'a' - 'A'
|
||||
}
|
||||
if UtoT && b == 'u' {
|
||||
b = 't'
|
||||
}
|
||||
dest = append(dest, b)
|
||||
}
|
||||
s.current = s.current.Next()
|
||||
s.pos = 0
|
||||
}
|
||||
return dest, false
|
||||
}
|
||||
|
||||
// FastaChunkParserRope parses a FASTA chunk directly from the rope without Pack().
|
||||
func FastaChunkParserRope(source string, rope *PieceOfChunk, UtoT bool) (obiseq.BioSequenceSlice, error) {
|
||||
scanner := newGbRopeScanner(rope)
|
||||
sequences := obiseq.MakeBioSequenceSlice(100)[:0]
|
||||
|
||||
for {
|
||||
bline := scanner.ReadLine()
|
||||
if bline == nil {
|
||||
break
|
||||
}
|
||||
if len(bline) == 0 || bline[0] != '>' {
|
||||
continue
|
||||
}
|
||||
|
||||
// Parse header: ">id definition"
|
||||
header := bline[1:]
|
||||
var id string
|
||||
var definition string
|
||||
sp := bytes.IndexByte(header, ' ')
|
||||
if sp < 0 {
|
||||
sp = bytes.IndexByte(header, '\t')
|
||||
}
|
||||
if sp < 0 {
|
||||
id = string(header)
|
||||
} else {
|
||||
id = string(header[:sp])
|
||||
definition = string(bytes.TrimSpace(header[sp+1:]))
|
||||
}
|
||||
|
||||
seqDest := make([]byte, 0, 4096)
|
||||
var hasMore bool
|
||||
seqDest, hasMore = scanner.extractFastaSeq(seqDest, UtoT)
|
||||
|
||||
if len(seqDest) == 0 {
|
||||
log.Fatalf("%s [%s]: sequence is empty", source, id)
|
||||
}
|
||||
|
||||
seq := obiseq.NewBioSequenceOwning(id, seqDest, definition)
|
||||
seq.SetSource(source)
|
||||
sequences = append(sequences, seq)
|
||||
|
||||
if !hasMore {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return sequences, nil
|
||||
}
|
||||
|
||||
func _ParseFastaFile(
|
||||
input ChannelFileChunk,
|
||||
out obiiter.IBioSequence,
|
||||
UtoT bool,
|
||||
) {
|
||||
|
||||
parser := FastaChunkParser(UtoT)
|
||||
|
||||
for chunks := range input {
|
||||
sequences, err := parser(chunks.Source, chunks.Raw)
|
||||
// obilog.Warnf("Chunck(%d:%d) -%d- ", chunks.Order, l, sequences.Len())
|
||||
var sequences obiseq.BioSequenceSlice
|
||||
var err error
|
||||
|
||||
if chunks.Rope != nil {
|
||||
sequences, err = FastaChunkParserRope(chunks.Source, chunks.Rope, UtoT)
|
||||
} else {
|
||||
sequences, err = parser(chunks.Source, chunks.Raw)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("File %s : Cannot parse the fasta file : %v", chunks.Source, err)
|
||||
}
|
||||
|
||||
out.Push(obiiter.MakeBioSequenceBatch(chunks.Source, chunks.Order, sequences))
|
||||
|
||||
}
|
||||
|
||||
out.Done()
|
||||
|
||||
}
|
||||
|
||||
func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, error) {
|
||||
@@ -245,7 +338,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
1024*1024,
|
||||
EndOfLastFastaEntry,
|
||||
"\n>",
|
||||
true,
|
||||
false,
|
||||
)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
|
||||
Reference in New Issue
Block a user