Optimize FASTA batch formatting

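Optimize FormatFastaBatch to pre-allocate the output buffer once, up front, and to write each header and the 60-column folded sequence lines directly into that buffer instead of building an intermediate string per sequence with FormatFasta. This reduces allocations and memory copies.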
This commit is contained in:
Eric Coissac
2026-03-10 15:43:53 +01:00
parent 761e0dbed3
commit b246025907

View File

@@ -77,45 +77,47 @@ func FormatFasta(seq *obiseq.BioSequence, formater FormatHeader) string {
// //
// It returns a byte array containing the formatted sequences. // It returns a byte array containing the formatted sequences.
func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer { func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, skipEmpty bool) *bytes.Buffer {
// Create a buffer to store the formatted sequences
var bs bytes.Buffer var bs bytes.Buffer
lt := 0 lt := 0
for _, seq := range batch.Slice() { for _, seq := range batch.Slice() {
lt += seq.Len() lt += seq.Len()
} }
// Iterate over each sequence in the batch // Pre-allocate: sequence data + newlines every 60 chars + ~100 bytes header per sequence
bs.Grow(lt + lt/60 + 100*batch.Len() + 1)
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len()) log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
first := true
for _, seq := range batch.Slice() { for _, seq := range batch.Slice() {
// Check if the sequence is empty
if seq.Len() > 0 { if seq.Len() > 0 {
// Format the sequence using the provided formater function // Write header directly into bs — no intermediate string
formattedSeq := FormatFasta(seq, formater) bs.WriteByte('>')
bs.WriteString(seq.Id())
if first { bs.WriteByte(' ')
bs.Grow(lt + (len(formattedSeq)-seq.Len())*batch.Len()*5/4) bs.WriteString(formater(seq))
first = false
}
// Append the formatted sequence to the buffer
bs.WriteString(formattedSeq)
bs.WriteByte('\n') bs.WriteByte('\n')
// Write folded sequence directly into bs — no copies
s := seq.Sequence()
l := len(s)
for i := 0; i < l; i += 60 {
to := i + 60
if to > l {
to = l
}
bs.Write(s[i:to])
bs.WriteByte('\n')
}
} else { } else {
// Handle empty sequences
if skipEmpty { if skipEmpty {
// Skip empty sequences if skipEmpty is true
obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id()) obilog.Warnf("Sequence %s is empty and skipped in output", seq.Id())
} else { } else {
// Terminate the program if skipEmpty is false
log.Fatalf("Sequence %s is empty", seq.Id()) log.Fatalf("Sequence %s is empty", seq.Id())
} }
} }
} }
// Return the byte array representation of the buffer
return &bs return &bs
} }