mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
Optimize Fasta batch formatting
Optimize FormatFastaBatch to pre-allocate buffer and write sequences directly without intermediate strings, improving performance and memory usage.
This commit is contained in:
@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
|
|||||||
//
|
//
|
||||||
// It returns a pointer to a bytes.Buffer containing the formatted sequences.
|
// It returns a pointer to a bytes.Buffer containing the formatted sequences.
|
||||||
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
|
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
|
||||||
// Create a buffer to store the formatted sequences
|
|
||||||
var bs bytes.Buffer
|
var bs bytes.Buffer
|
||||||
|
|
||||||
lt := 0
|
lt := 0
|
||||||
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
lt += seq.Len()
|
lt += seq.Len()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Iterate over each sequence in the batch
|
// Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
|
||||||
|
bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
|
||||||
|
|
||||||
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
||||||
first := true
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
// Check if the sequence is empty
|
|
||||||
if seq.Len() > 0 {
|
if seq.Len() > 0 {
|
||||||
// Format the sequence using the provided formater function
|
// Write header directly into bs — no intermediate string
|
||||||
formattedSeq := FormatFasta(seq, formater)
|
bs.WriteByte('>')
|
||||||
|
bs.WriteString(seq.Id())
|
||||||
if first {
|
bs.WriteByte(' ')
|
||||||
bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4)
|
bs.WriteString(formater(seq))
|
||||||
first = false
|
|
||||||
}
|
|
||||||
|
|
||||||
// Append the formatted sequence to the buffer
|
|
||||||
bs.WriteString(formattedSeq)
|
|
||||||
bs.WriteByte('\n')
|
bs.WriteByte('\n')
|
||||||
|
|
||||||
|
// Write folded sequence directly into bs — no copies
|
||||||
|
s := seq.Sequence()
|
||||||
|
l := len(s)
|
||||||
|
for i := 0; i < l; i += 60 {
|
||||||
|
to := i + 60
|
||||||
|
if to > l {
|
||||||
|
to = l
|
||||||
|
}
|
||||||
|
bs.Write(s[i:to])
|
||||||
|
bs.WriteByte('\n')
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// Handle empty sequences
|
|
||||||
if skipEmpty {
|
if skipEmpty {
|
||||||
// Skip empty sequences if skipEmpty is true
|
|
||||||
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
|
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
|
||||||
} else {
|
} else {
|
||||||
// Terminate the program if skipEmpty is false
|
|
||||||
log.Fatalf("Sequence %s is empty", seq.Id())
|
log.Fatalf("Sequence %s is empty", seq.Id())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return the byte array representation of the buffer
|
|
||||||
return &bs
|
return &bs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user