mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via the WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update the CLI to use the new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce the KDI (K-mer Delta Index) file format for efficient storage
- Add K-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect the new API

This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
47
pkg/obikmer/minimizer_utils.go
Normal file
47
pkg/obikmer/minimizer_utils.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default
// minimizer size for k-mers of size k.
//
// The result is kept strictly below k whenever that is possible (k >= 2) and
// is never smaller than 1. For degenerate k-mer sizes (k <= 1) no value can
// satisfy 1 <= m < k, so the function returns 1 rather than a zero or
// negative size.
func DefaultMinimizerSize(k int) int {
	m := int(math.Ceil(float64(k) / 2.5))

	// A minimizer must be strictly shorter than the k-mer it is drawn from.
	if m >= k {
		m = k - 1
	}

	// The lower bound is applied last so that it always wins: a minimizer
	// of size 0 or less is meaningless, whatever the value of k.
	if m < 1 {
		m = 1
	}

	return m
}
|
||||
|
||||
// MinMinimizerSize returns the minimum m such that 4^m >= nworkers,
// i.e. ceil(log(nworkers) / log(4)), with a floor of 1.
//
// Any nworkers <= 1 yields 1. The value is computed with integer arithmetic
// only, so exact powers of 4 cannot be pushed to the next integer by
// floating-point rounding of the logarithms.
func MinMinimizerSize(nworkers int) int {
	if nworkers <= 1 {
		return 1
	}

	// Count the base-4 digits of nworkers-1: that is the smallest m with
	// 4^m >= nworkers. Shifting right by two bits divides by 4 and cannot
	// overflow, unlike repeatedly multiplying a power of 4.
	m := 0
	for n := nworkers - 1; n > 0; n >>= 2 {
		m++
	}

	return m
}
|
||||
|
||||
// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints:
|
||||
// - m >= ceil(log(nworkers)/log(4))
|
||||
// - 1 <= m < k
|
||||
func ValidateMinimizerSize(m, k, nworkers int) int {
|
||||
minM := MinMinimizerSize(nworkers)
|
||||
if m < minM {
|
||||
log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d",
|
||||
m, nworkers, m, 1<<(2*m), nworkers, minM)
|
||||
m = minM
|
||||
}
|
||||
if m < 1 {
|
||||
m = 1
|
||||
}
|
||||
if m >= k {
|
||||
m = k - 1
|
||||
}
|
||||
return m
|
||||
}
|
||||
Reference in New Issue
Block a user