mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add memory-based batching support
Implement memory-aware batch sizing with --batch-mem CLI option, enabling adaptive batching based on estimated sequence memory footprint. Key changes: - Added _BatchMem and related getters/setters in pkg/obidefault - Implemented RebatchBySize() in pkg/obiter for memory-constrained batching - Added BioSequence.MemorySize() for conservative memory estimation - Integrated batch-mem option in pkg/obioptions with human-readable size parsing (e.g., 128K, 64M, 1G) - Added obiutils.ParseMemSize/FormatMemSize for unit conversion - Enhanced pool GC in pkg/obiseq/pool.go to trigger explicit GC for large slice discards - Updated sequence_reader.go to apply memory-based rebatching when enabled
This commit is contained in:
85
pkg/obiutils/memsize.go
Normal file
85
pkg/obiutils/memsize.go
Normal file
@@ -0,0 +1,85 @@
|
||||
package obiutils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// ParseMemSize parses a human-readable memory size string and returns the
// equivalent number of bytes. The value is a non-negative decimal number
// (fractions allowed) optionally followed by a unit suffix. The suffix is
// case-insensitive, and whitespace around the number and the unit is ignored:
//
//	B or no suffix — bytes
//	K, KB or KiB   — kibibytes (1 024)
//	M, MB or MiB   — mebibytes (1 048 576)
//	G, GB or GiB   — gibibytes (1 073 741 824)
//	T, TB or TiB   — tebibytes (1 099 511 627 776)
//
// Examples: "512", "128K", "128k", "64M", "1G", "2GB", "1.5K"
//
// Fractional byte counts are truncated toward zero. An error is returned for
// an empty string, a malformed number, an unknown unit, or a value that does
// not fit in an int.
func ParseMemSize(s string) (int, error) {
	s = strings.TrimSpace(s)
	if s == "" {
		return 0, fmt.Errorf("empty memory size string")
	}

	// Split the numeric prefix (digits and '.') from the unit suffix. A sign
	// is not part of the prefix, so negative sizes fail the number parse below.
	i := 0
	for i < len(s) && (unicode.IsDigit(rune(s[i])) || s[i] == '.') {
		i++
	}
	numStr := s[:i]
	unit := strings.ToUpper(strings.TrimSpace(s[i:]))

	val, err := strconv.ParseFloat(numStr, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid memory size %q: %w", s, err)
	}

	// Units are matched explicitly instead of stripping a trailing 'B', so
	// that malformed suffixes such as "BB" are rejected.
	var multiplier float64
	switch unit {
	case "", "B":
		multiplier = 1
	case "K", "KB", "KIB":
		multiplier = 1 << 10
	case "M", "MB", "MIB":
		multiplier = 1 << 20
	case "G", "GB", "GIB":
		multiplier = 1 << 30
	case "T", "TB", "TIB":
		multiplier = 1 << 40
	default:
		return 0, fmt.Errorf("unknown memory unit %q in %q", unit, s)
	}

	total := val * multiplier

	// Converting an out-of-range float to int is implementation-dependent in
	// Go, so reject such values explicitly. float64(maxInt)+1 is exactly
	// 2^63 (2^31 where int is 32 bits): the first value that no longer fits.
	const maxInt = int(^uint(0) >> 1)
	if total >= float64(maxInt)+1 {
		return 0, fmt.Errorf("memory size %q is too large", s)
	}

	return int(total), nil
}
|
||||
|
||||
// FormatMemSize formats a byte count as a human-readable string using the
// binary (1024-based) units K, M, G and T. It picks the largest unit for
// which the value is at least 1.
//
// Whole multiples of the unit are printed without decimals (2048 → "2K");
// other values are printed with one decimal (1536 → "1.5K"). When rounding to
// one decimal would display "1024.0" of a unit, the next larger unit is used
// instead (1048575 → "1.0M"). Values below 1024, including negative values,
// are printed as a plain byte count with a "B" suffix.
func FormatMemSize(n int) string {
	// Sizes are held as int64 so that the tebibyte constant does not overflow
	// on platforms where int is 32 bits wide.
	units := []struct {
		suffix string
		size   int64
	}{
		{"T", 1 << 40},
		{"G", 1 << 30},
		{"M", 1 << 20},
		{"K", 1 << 10},
	}

	for i, u := range units {
		if int64(n) < u.size {
			continue
		}

		v := float64(n) / float64(u.size)
		if v == float64(int64(v)) {
			return fmt.Sprintf("%d%s", int64(v), u.suffix)
		}

		text := fmt.Sprintf("%.1f", v)
		// A value just below the next unit rounds up to "1024.0"; show it as
		// 1.0 of the next larger unit (units are ordered largest first).
		if text == "1024.0" && i > 0 {
			return "1.0" + units[i-1].suffix
		}
		return text + u.suffix
	}

	return fmt.Sprintf("%dB", n)
}
|
||||
Reference in New Issue
Block a user