Add memory-based batching support

Implement memory-aware batch sizing with the --batch-mem CLI option, enabling adaptive batching based on the estimated memory footprint of sequences.

Key changes:
- Added _BatchMem and related getters/setters in pkg/obidefault
- Implemented RebatchBySize() in pkg/obiter for memory-constrained batching
- Added BioSequence.MemorySize() for conservative memory estimation
- Integrated the batch-mem option in pkg/obioptions with human-readable size parsing (e.g., 128K, 64M, 1G)
- Added obiutils.ParseMemSize/FormatMemSize for unit conversion
- Enhanced pool GC in pkg/obiseq/pool.go to trigger an explicit GC when large slices are discarded
- Updated sequence_reader.go to apply memory-based rebatching when enabled
2026-03-25 13:30:52 +00:00 · 2026-03-13 14:54:14 +01:00
parent 74e6fcaf83
commit 40769bf827
7 changed files with 225 additions and 0 deletions
--- a/pkg/obiseq/biosequence.go
+++ b/pkg/obiseq/biosequence.go
@@ -273,6 +273,28 @@ func (s *BioSequence) Len() int {
 	return len(s.sequence)
 }

+// MemorySize reports an approximate number of bytes held by the
+// BioSequence. The total adds a fixed per-struct allowance, the capacity
+// (not the length) of the sequence, qualities and feature slices, the
+// length of id and source, and a flat charge per annotation entry.
+// A nil receiver reports 0.
+func (s *BioSequence) MemorySize() int {
+	const (
+		structOverhead  = 128 // flat allowance for the struct itself
+		annotationBytes = 64  // flat charge per annotation entry
+	)
+
+	if s == nil {
+		return 0
+	}
+
+	buffers := cap(s.sequence) + cap(s.qualities) + cap(s.feature)
+	labels := len(s.id) + len(s.source)
+	notes := len(s.annotations) * annotationBytes
+
+	return structOverhead + buffers + labels + notes
+}
+
 // HasQualities checks if the BioSequence has sequence qualitiy scores.
 //
 // This function does not have any parameters.