mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Optimisation du parsing des grandes séquences
Optimise le parsing des grandes séquences en évitant les allocations mémoire inutiles lors de la fusion des chunks. Ajoute la prise en charge du parsing direct de la structure rope, ce qui réduit les allocations et améliore les performances lors du traitement de fichiers GenBank/EMBL et FASTA/FASTQ de plusieurs Gbp. Les parseurs sont mis à jour pour utiliser la rope non packée et le nouveau mécanisme d'écriture in-place pour les séquences GenBank.
This commit is contained in:
@@ -196,6 +196,7 @@ func ReadEMBL(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, er
|
||||
1024*1024*128,
|
||||
EndOfLastFlatFileEntry,
|
||||
"\nID ",
|
||||
true,
|
||||
)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
@@ -245,6 +245,7 @@ func ReadFasta(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
1024*1024,
|
||||
EndOfLastFastaEntry,
|
||||
"\n>",
|
||||
true,
|
||||
)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
|
||||
@@ -339,6 +339,7 @@ func ReadFastq(reader io.Reader, options ...WithOption) (obiiter.IBioSequence, e
|
||||
1024*1024,
|
||||
EndOfLastFastqEntry,
|
||||
"\n@",
|
||||
true,
|
||||
)
|
||||
|
||||
for i := 0; i < nworker; i++ {
|
||||
|
||||
@@ -16,6 +16,7 @@ type SeqFileChunkParser func(string, io.Reader) (obiseq.BioSequenceSlice, error)
|
||||
type FileChunk struct {
|
||||
Source string
|
||||
Raw *bytes.Buffer
|
||||
Rope *PieceOfChunk
|
||||
Order int
|
||||
}
|
||||
|
||||
@@ -97,11 +98,17 @@ func (piece *PieceOfChunk) IsLast() bool {
|
||||
return piece.next == nil
|
||||
}
|
||||
|
||||
func (piece *PieceOfChunk) FileChunk(source string, order int) FileChunk {
|
||||
piece.Pack()
|
||||
func (piece *PieceOfChunk) FileChunk(source string, order int, pack bool) FileChunk {
|
||||
piece = piece.Head()
|
||||
var raw *bytes.Buffer
|
||||
if pack {
|
||||
piece.Pack()
|
||||
raw = bytes.NewBuffer(piece.data)
|
||||
}
|
||||
return FileChunk{
|
||||
Source: source,
|
||||
Raw: bytes.NewBuffer(piece.data),
|
||||
Raw: raw,
|
||||
Rope: piece,
|
||||
Order: order,
|
||||
}
|
||||
}
|
||||
@@ -133,7 +140,8 @@ func ReadFileChunk(
|
||||
reader io.Reader,
|
||||
fileChunkSize int,
|
||||
splitter LastSeqRecord,
|
||||
probe string) ChannelFileChunk {
|
||||
probe string,
|
||||
pack bool) ChannelFileChunk {
|
||||
|
||||
chunk_channel := make(ChannelFileChunk)
|
||||
|
||||
@@ -205,7 +213,7 @@ func ReadFileChunk(
|
||||
|
||||
if len(pieces.data) > 0 {
|
||||
// obilog.Warnf("chunk %d: read %d bytes from file %s", i, io.Len(), source)
|
||||
chunk_channel <- pieces.FileChunk(source, i)
|
||||
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||
i++
|
||||
}
|
||||
|
||||
@@ -222,7 +230,7 @@ func ReadFileChunk(
|
||||
|
||||
// Send the last chunk to the channel
|
||||
if pieces.Len() > 0 {
|
||||
chunk_channel <- pieces.FileChunk(source, i)
|
||||
chunk_channel <- pieces.FileChunk(source, i, pack)
|
||||
}
|
||||
|
||||
// Close the readers channel when the end of the file is reached
|
||||
|
||||
@@ -233,6 +233,7 @@ func ReadGenbank(reader io.Reader, options ...WithOption) (obiiter.IBioSequence,
|
||||
1024*1024*128,
|
||||
EndOfLastFlatFileEntry,
|
||||
"\nLOCUS ",
|
||||
true,
|
||||
)
|
||||
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
|
||||
Reference in New Issue
Block a user