Add memory-based batching support

Implement memory-aware batch sizing with --batch-mem CLI option, enabling adaptive batching based on estimated sequence memory footprint. Key changes:
- Added _BatchMem and related getters/setters in pkg/obidefault
- Implemented RebatchBySize() in pkg/obiter for memory-constrained batching
- Added BioSequence.MemorySize() for conservative memory estimation
- Integrated batch-mem option in pkg/obioptions with human-readable size parsing (e.g., 128K, 64M, 1G)
- Added obiutils.ParseMemSize/FormatMemSize for unit conversion
- Enhanced pool GC in pkg/obiseq/pool.go to trigger explicit GC for large slice discards
- Updated sequence_reader.go to apply memory-based rebatching when enabled
This commit is contained in:
Eric Coissac
2026-03-13 14:54:14 +01:00
parent 74e6fcaf83
commit 40769bf827
7 changed files with 225 additions and 0 deletions

View File

@@ -1,13 +1,20 @@
package obiseq
import (
"runtime"
"sync"
"sync/atomic"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
const _LargeSliceThreshold = 100 * 1024 // 100 kb — below: leave to GC, above: trigger explicit GC
const _GCBytesBudget = int64(256 * 1024 * 1024) // trigger GC every 256 MB of large discards
var _largeSliceDiscardedBytes = atomic.Int64{}
var _BioSequenceByteSlicePool = sync.Pool{
New: func() interface{} {
bs := make([]byte, 0, 300)
@@ -34,6 +41,13 @@ func RecycleSlice(s *[]byte) {
}
if cap(*s) <= 1024 {
_BioSequenceByteSlicePool.Put(s)
} else if cap(*s) >= _LargeSliceThreshold {
n := int64(cap(*s))
*s = nil
prev := _largeSliceDiscardedBytes.Load()
if _largeSliceDiscardedBytes.Add(n)/_GCBytesBudget > prev/_GCBytesBudget {
runtime.GC()
}
}
}
}