Add memory-based batching support

Implement memory-aware batch sizing with --batch-mem CLI option, enabling adaptive batching based on estimated sequence memory footprint. Key changes:
- Added _BatchMem and related getters/setters in pkg/obidefault
- Implemented RebatchBySize() in pkg/obiter for memory-constrained batching
- Added BioSequence.MemorySize() for conservative memory estimation
- Integrated batch-mem option in pkg/obioptions with human-readable size parsing (e.g., 128K, 64M, 1G)
- Added obiutils.ParseMemSize/FormatMemSize for unit conversion
- Enhanced pool GC in pkg/obiseq/pool.go to trigger explicit GC for large slice discards
- Updated sequence_reader.go to apply memory-based rebatching when enabled
This commit is contained in:
Eric Coissac
2026-03-13 14:54:14 +01:00
parent 74e6fcaf83
commit 40769bf827
7 changed files with 225 additions and 0 deletions

85
pkg/obiutils/memsize.go Normal file
View File

@@ -0,0 +1,85 @@
package obiutils
import (
"fmt"
"strconv"
"strings"
"unicode"
)
// ParseMemSize parses a human-readable memory size string and returns the
// equivalent number of bytes. The value is a non-negative decimal number
// (digits with an optional fractional part) optionally followed by a unit
// suffix (case-insensitive). All units are powers of 1024:
//
//	B or (no suffix) — bytes
//	K or KB          — kibibytes (1 024)
//	M or MB          — mebibytes (1 048 576)
//	G or GB          — gibibytes (1 073 741 824)
//	T or TB          — tebibytes (1 099 511 627 776)
//
// Leading and trailing whitespace, as well as whitespace between the number
// and the unit, is ignored. A fractional byte count is truncated toward zero
// (e.g. "0.1K" yields 102).
//
// An error is returned when the string is empty, when the numeric part is
// missing or malformed, when the unit is unknown, or when the resulting byte
// count does not fit in an int.
//
// Examples: "512", "128K", "128k", "64M", "1G", "2GB", "1.5M"
func ParseMemSize(s string) (int, error) {
	// Smallest float64 value that no longer fits in an int (2^31 or 2^63
	// depending on the platform). Converting an out-of-range float64 to int is
	// implementation-defined, so such values are rejected before converting.
	const intLimit = float64(1 << (strconv.IntSize - 1))

	s = strings.TrimSpace(s)
	if s == "" {
		return 0, fmt.Errorf("empty memory size string")
	}

	// Split the numeric prefix from the unit suffix. The string is scanned
	// byte by byte; only ASCII digits and '.' belong to the number.
	end := 0
	for end < len(s) && (unicode.IsDigit(rune(s[end])) || s[end] == '.') {
		end++
	}
	numStr := s[:end]
	unit := strings.ToUpper(strings.TrimSpace(s[end:]))

	// Strip the trailing 'B' from two-letter units (KB→K, MB→M, …). "BB" is
	// left untouched so that it is reported as an unknown unit below instead
	// of being silently read as plain bytes.
	if len(unit) == 2 && unit[1] == 'B' && unit[0] != 'B' {
		unit = unit[:1]
	}

	val, err := strconv.ParseFloat(numStr, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid memory size %q: %w", s, err)
	}

	var multiplier float64
	switch unit {
	case "", "B":
		multiplier = 1
	case "K":
		multiplier = 1 << 10
	case "M":
		multiplier = 1 << 20
	case "G":
		multiplier = 1 << 30
	case "T":
		multiplier = 1 << 40
	default:
		return 0, fmt.Errorf("unknown memory unit %q in %q", unit, s)
	}

	// val cannot be negative or NaN here (the numeric prefix holds only digits
	// and dots, and ParseFloat reports an error for out-of-range input), so
	// only the upper bound needs checking.
	total := val * multiplier
	if total >= intLimit {
		return 0, fmt.Errorf("memory size %q overflows int", s)
	}
	return int(total), nil
}
// FormatMemSize formats a byte count as a human-readable string, using the
// largest unit among T, G, M and K (powers of 1024) for which the value is at
// least 1. A value that is a whole number of units is printed without a
// fractional part (1024 → "1K"); any other value is printed rounded to one
// decimal place (1536 → "1.5K", 1025 → "1.0K"). Counts below 1024, including
// zero and negative values, are printed as a plain number of bytes with a "B"
// suffix (512 → "512B").
func FormatMemSize(n int) string {
	// Unit sizes are stored as int64 rather than int so that the 1 TiB
	// constant still compiles on platforms where int is only 32 bits wide.
	units := [...]struct {
		suffix string
		size   int64
	}{
		{"T", 1 << 40},
		{"G", 1 << 30},
		{"M", 1 << 20},
		{"K", 1 << 10},
	}

	// Units are ordered from largest to smallest, so the first one that fits
	// is the largest suitable unit.
	for _, u := range units {
		if int64(n) < u.size {
			continue
		}
		v := float64(n) / float64(u.size)
		if whole := int(v); v == float64(whole) {
			return fmt.Sprintf("%d%s", whole, u.suffix)
		}
		return fmt.Sprintf("%.1f%s", v, u.suffix)
	}
	return fmt.Sprintf("%dB", n)
}