mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor kmer index to disk-based partitioning with minimizer
Refactor kmer index package to use disk-based partitioning with minimizer - Replace roaring64 bitmaps with disk-based kmer index - Implement partitioned kmer sets with delta-varint encoding - Add support for frequency filtering during construction - Introduce new builder pattern for index construction - Add streaming operations for set operations (union, intersect, etc.) - Add support for super-kmer encoding during construction - Update command line tool to use new index format - Remove dependency on roaring bitmap library This change introduces a new architecture for kmer indexing that is more memory efficient and scalable for large datasets.
This commit is contained in:
@@ -174,6 +174,13 @@ func (ff *FrequencyFilter) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
||||
}
|
||||
}
|
||||
|
||||
// AddSequenceSlice adds all k-mers from a slice of sequences to the filter
|
||||
func (ff *FrequencyFilter) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
|
||||
for _, seq := range *sequences {
|
||||
ff.AddSequence(seq)
|
||||
}
|
||||
}
|
||||
|
||||
// ==================================
|
||||
// PERSISTANCE
|
||||
// ==================================
|
||||
|
||||
204
pkg/obikmer/kmer_index_builder.go
Normal file
204
pkg/obikmer/kmer_index_builder.go
Normal file
@@ -0,0 +1,204 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"math"
|
||||
"sync"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
)
|
||||
|
||||
// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default minimizer size.
|
||||
func DefaultMinimizerSize(k int) int {
|
||||
m := int(math.Ceil(float64(k) / 2.5))
|
||||
if m < 1 {
|
||||
m = 1
|
||||
}
|
||||
if m >= k {
|
||||
m = k - 1
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// MinMinimizerSize returns the minimum m such that 4^m >= nworkers,
|
||||
// i.e. ceil(log(nworkers) / log(4)).
|
||||
func MinMinimizerSize(nworkers int) int {
|
||||
if nworkers <= 1 {
|
||||
return 1
|
||||
}
|
||||
return int(math.Ceil(math.Log(float64(nworkers)) / math.Log(4)))
|
||||
}
|
||||
|
||||
// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints:
|
||||
// - m >= ceil(log(nworkers)/log(4))
|
||||
// - 1 <= m < k
|
||||
func ValidateMinimizerSize(m, k, nworkers int) int {
|
||||
minM := MinMinimizerSize(nworkers)
|
||||
if m < minM {
|
||||
log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d",
|
||||
m, nworkers, m, 1<<(2*m), nworkers, minM)
|
||||
m = minM
|
||||
}
|
||||
if m < 1 {
|
||||
m = 1
|
||||
}
|
||||
if m >= k {
|
||||
m = k - 1
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// BuildKmerIndex builds a KmerSet from an iterator using parallel super-kmer partitioning.
|
||||
//
|
||||
// The algorithm:
|
||||
// 1. Extract super-kmers from each sequence using IterSuperKmers
|
||||
// 2. Route each super-kmer to a worker based on minimizer % nworkers
|
||||
// 3. Each worker extracts canonical k-mers and adds them to its local KmerSet
|
||||
// 4. Merge all KmerSets via Union
|
||||
//
|
||||
// Parameters:
|
||||
// - iterator: source of BioSequence batches
|
||||
// - k: k-mer size (1-31)
|
||||
// - m: minimizer size (1 to k-1)
|
||||
func BuildKmerIndex(iterator obiiter.IBioSequence, k, m int) *KmerSet {
|
||||
nproc := obidefault.ParallelWorkers()
|
||||
m = ValidateMinimizerSize(m, k, nproc)
|
||||
|
||||
// Channels to route super-kmers to workers
|
||||
channels := make([]chan SuperKmer, nproc)
|
||||
for i := range channels {
|
||||
channels[i] = make(chan SuperKmer, 1024)
|
||||
}
|
||||
|
||||
// Workers: each manages a partition of the minimizer space
|
||||
sets := make([]*KmerSet, nproc)
|
||||
waiter := sync.WaitGroup{}
|
||||
waiter.Add(nproc)
|
||||
for i := 0; i < nproc; i++ {
|
||||
sets[i] = NewKmerSet(k)
|
||||
go func(ch chan SuperKmer, ks *KmerSet) {
|
||||
defer waiter.Done()
|
||||
for sk := range ch {
|
||||
for kmer := range IterCanonicalKmers(sk.Sequence, k) {
|
||||
ks.AddKmerCode(kmer)
|
||||
}
|
||||
}
|
||||
}(channels[i], sets[i])
|
||||
}
|
||||
|
||||
// Reader: extract super-kmers and route them
|
||||
seqCount := 0
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
rawSeq := seq.Sequence()
|
||||
if len(rawSeq) < k {
|
||||
continue
|
||||
}
|
||||
for sk := range IterSuperKmers(rawSeq, k, m) {
|
||||
worker := int(sk.Minimizer % uint64(nproc))
|
||||
channels[worker] <- sk
|
||||
}
|
||||
seqCount++
|
||||
}
|
||||
}
|
||||
|
||||
// Close channels to signal workers to finish
|
||||
for _, ch := range channels {
|
||||
close(ch)
|
||||
}
|
||||
waiter.Wait()
|
||||
|
||||
log.Infof("Processed %d sequences", seqCount)
|
||||
|
||||
// Merge partitions (mostly disjoint -> fast union)
|
||||
result := sets[0]
|
||||
for i := 1; i < nproc; i++ {
|
||||
result.bitmap.Or(sets[i].bitmap)
|
||||
}
|
||||
|
||||
log.Infof("Index contains %d k-mers (%.2f MB)",
|
||||
result.Len(), float64(result.MemoryUsage())/1024/1024)
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// BuildFrequencyFilterIndex builds a FrequencyFilter from an iterator
|
||||
// using parallel super-kmer partitioning.
|
||||
//
|
||||
// Each worker manages its own FrequencyFilter for its partition of the
|
||||
// minimizer space. Since all k-mers sharing a minimizer go to the same worker,
|
||||
// the frequency counting is correct per partition.
|
||||
//
|
||||
// Parameters:
|
||||
// - iterator: source of BioSequence batches
|
||||
// - k: k-mer size (1-31)
|
||||
// - m: minimizer size (1 to k-1)
|
||||
// - minFreq: minimum frequency threshold (>= 1)
|
||||
func BuildFrequencyFilterIndex(iterator obiiter.IBioSequence, k, m, minFreq int) *FrequencyFilter {
|
||||
nproc := obidefault.ParallelWorkers()
|
||||
m = ValidateMinimizerSize(m, k, nproc)
|
||||
|
||||
// Channels to route super-kmers to workers
|
||||
channels := make([]chan SuperKmer, nproc)
|
||||
for i := range channels {
|
||||
channels[i] = make(chan SuperKmer, 1024)
|
||||
}
|
||||
|
||||
// Workers: each manages a local FrequencyFilter
|
||||
filters := make([]*FrequencyFilter, nproc)
|
||||
waiter := sync.WaitGroup{}
|
||||
waiter.Add(nproc)
|
||||
for i := 0; i < nproc; i++ {
|
||||
filters[i] = NewFrequencyFilter(k, minFreq)
|
||||
go func(ch chan SuperKmer, ff *FrequencyFilter) {
|
||||
defer waiter.Done()
|
||||
for sk := range ch {
|
||||
for kmer := range IterCanonicalKmers(sk.Sequence, k) {
|
||||
ff.AddKmerCode(kmer)
|
||||
}
|
||||
}
|
||||
}(channels[i], filters[i])
|
||||
}
|
||||
|
||||
// Reader: extract super-kmers and route them
|
||||
seqCount := 0
|
||||
for iterator.Next() {
|
||||
batch := iterator.Get()
|
||||
for _, seq := range batch.Slice() {
|
||||
rawSeq := seq.Sequence()
|
||||
if len(rawSeq) < k {
|
||||
continue
|
||||
}
|
||||
for sk := range IterSuperKmers(rawSeq, k, m) {
|
||||
worker := int(sk.Minimizer % uint64(nproc))
|
||||
channels[worker] <- sk
|
||||
}
|
||||
seqCount++
|
||||
}
|
||||
}
|
||||
|
||||
// Close channels to signal workers to finish
|
||||
for _, ch := range channels {
|
||||
close(ch)
|
||||
}
|
||||
waiter.Wait()
|
||||
|
||||
log.Infof("Processed %d sequences", seqCount)
|
||||
|
||||
// Merge FrequencyFilters: union level by level
|
||||
result := filters[0]
|
||||
for i := 1; i < nproc; i++ {
|
||||
for level := 0; level < minFreq; level++ {
|
||||
result.Get(level).bitmap.Or(filters[i].Get(level).bitmap)
|
||||
}
|
||||
}
|
||||
|
||||
stats := result.Stats()
|
||||
log.Infof("FrequencyFilter: %d k-mers with freq >= %d (%.2f MB total)",
|
||||
stats.FilteredKmers, minFreq, float64(stats.TotalBytes)/1024/1024)
|
||||
|
||||
return result
|
||||
}
|
||||
@@ -82,6 +82,13 @@ func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
|
||||
}
|
||||
}
|
||||
|
||||
// AddSequenceSlice adds all k-mers from a slice of sequences
|
||||
func (ks *KmerSet) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
|
||||
for _, seq := range *sequences {
|
||||
ks.AddSequence(seq)
|
||||
}
|
||||
}
|
||||
|
||||
// Contains checks if a k-mer is in the set
|
||||
func (ks *KmerSet) Contains(kmer uint64) bool {
|
||||
return ks.bitmap.Contains(kmer)
|
||||
|
||||
@@ -145,6 +145,14 @@ func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index
|
||||
ksg.sets[index].AddSequences(sequences)
|
||||
}
|
||||
|
||||
// AddSequenceSlice adds all k-mers from a slice of sequences to a specific KmerSet
|
||||
func (ksg *KmerSetGroup) AddSequenceSlice(sequences *obiseq.BioSequenceSlice, index int) {
|
||||
if index < 0 || index >= len(ksg.sets) {
|
||||
panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
|
||||
}
|
||||
ksg.sets[index].AddSequenceSlice(sequences)
|
||||
}
|
||||
|
||||
// Union returns the union of all KmerSet in the group
|
||||
// Optimization: starts from the largest set to minimize operations
|
||||
func (ksg *KmerSetGroup) Union() *KmerSet {
|
||||
|
||||
99
pkg/obitools/obikindex/obikindex.go
Normal file
99
pkg/obitools/obikindex/obikindex.go
Normal file
@@ -0,0 +1,99 @@
|
||||
package obikindex
|
||||
|
||||
import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
)
|
||||
|
||||
// CLIBuildKmerIndex reads sequences from the iterator and builds a kmer index
|
||||
// saved as a roaring bitmap directory.
|
||||
func CLIBuildKmerIndex(iterator obiiter.IBioSequence) {
|
||||
// Validate output directory
|
||||
outDir := CLIOutputDirectory()
|
||||
if outDir == "" || outDir == "-" {
|
||||
log.Fatalf("Error: --out option is required and must specify a directory path (not stdout)")
|
||||
}
|
||||
|
||||
// Validate k-mer size
|
||||
k := CLIKmerSize()
|
||||
if k < 2 || k > 31 {
|
||||
log.Fatalf("Invalid k-mer size: %d (must be between 2 and 31)", k)
|
||||
}
|
||||
|
||||
// Resolve minimizer size
|
||||
m := CLIMinimizerSize()
|
||||
|
||||
// Validate min-occurrence
|
||||
minOcc := CLIMinOccurrence()
|
||||
if minOcc < 1 {
|
||||
log.Fatalf("Invalid min-occurrence: %d (must be >= 1)", minOcc)
|
||||
}
|
||||
|
||||
// Resolve metadata format
|
||||
format := CLIMetadataFormat()
|
||||
|
||||
log.Infof("Building kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
|
||||
|
||||
if minOcc <= 1 {
|
||||
// Simple KmerSet mode
|
||||
ks := obikmer.BuildKmerIndex(iterator, k, m)
|
||||
|
||||
// Apply metadata
|
||||
applyKmerSetMetadata(ks)
|
||||
|
||||
// Save
|
||||
log.Infof("Saving KmerSet to %s", outDir)
|
||||
if err := ks.Save(outDir, format); err != nil {
|
||||
log.Fatalf("Failed to save kmer index: %v", err)
|
||||
}
|
||||
} else {
|
||||
// FrequencyFilter mode
|
||||
ff := obikmer.BuildFrequencyFilterIndex(iterator, k, m, minOcc)
|
||||
|
||||
if CLISaveFullFilter() {
|
||||
// Save the full filter (all levels)
|
||||
applyMetadataGroup(ff.KmerSetGroup)
|
||||
|
||||
log.Infof("Saving full FrequencyFilter to %s", outDir)
|
||||
if err := ff.Save(outDir, format); err != nil {
|
||||
log.Fatalf("Failed to save frequency filter: %v", err)
|
||||
}
|
||||
} else {
|
||||
// Save only the filtered KmerSet (k-mers with freq >= minOcc)
|
||||
ks := ff.GetFilteredSet()
|
||||
applyKmerSetMetadata(ks)
|
||||
ks.SetAttribute("min_occurrence", minOcc)
|
||||
|
||||
log.Infof("Saving filtered KmerSet (freq >= %d) to %s", minOcc, outDir)
|
||||
if err := ks.Save(outDir, format); err != nil {
|
||||
log.Fatalf("Failed to save filtered kmer index: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log.Info("Done.")
|
||||
}
|
||||
|
||||
// applyKmerSetMetadata sets index-id and --set-tag metadata on a KmerSet.
|
||||
func applyKmerSetMetadata(ks *obikmer.KmerSet) {
|
||||
if id := CLIIndexId(); id != "" {
|
||||
ks.SetId(id)
|
||||
}
|
||||
|
||||
for key, value := range CLISetTag() {
|
||||
ks.SetAttribute(key, value)
|
||||
}
|
||||
}
|
||||
|
||||
// applyMetadataGroup sets index-id and --set-tag metadata on a KmerSetGroup.
|
||||
func applyMetadataGroup(ksg *obikmer.KmerSetGroup) {
|
||||
if id := CLIIndexId(); id != "" {
|
||||
ksg.SetId(id)
|
||||
}
|
||||
|
||||
for key, value := range CLISetTag() {
|
||||
ksg.SetAttribute(key, value)
|
||||
}
|
||||
}
|
||||
131
pkg/obitools/obikindex/options.go
Normal file
131
pkg/obitools/obikindex/options.go
Normal file
@@ -0,0 +1,131 @@
|
||||
package obikindex
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Private variables for storing option values
|
||||
var _kmerSize = 31
|
||||
var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5)
|
||||
var _indexId = ""
|
||||
var _metadataFormat = "toml"
|
||||
var _setTag = make(map[string]string, 0)
|
||||
var _minOccurrence = 1
|
||||
var _saveFullFilter = false
|
||||
|
||||
// KmerIndexOptionSet defines every option related to kmer index building.
|
||||
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||
options.Alias("k"),
|
||||
options.Description("Size of k-mers (must be between 2 and 31)."))
|
||||
|
||||
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
|
||||
options.Alias("m"),
|
||||
options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))
|
||||
|
||||
options.StringVar(&_indexId, "index-id", _indexId,
|
||||
options.Description("Identifier for the kmer index."))
|
||||
|
||||
options.StringVar(&_metadataFormat, "metadata-format", _metadataFormat,
|
||||
options.Description("Format for metadata file (toml, yaml, json)."))
|
||||
|
||||
options.StringMapVar(&_setTag, "set-tag", 1, 1,
|
||||
options.Alias("S"),
|
||||
options.ArgName("KEY=VALUE"),
|
||||
options.Description("Adds a metadata attribute KEY with value VALUE to the index."))
|
||||
|
||||
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
|
||||
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
|
||||
|
||||
options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
|
||||
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
|
||||
}
|
||||
|
||||
// OptionSet adds to the basic option set every option declared for
|
||||
// the obikindex command.
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.InputOptionSet(options)
|
||||
obiconvert.OutputModeOptionSet(options, false)
|
||||
KmerIndexOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the k-mer size.
|
||||
func CLIKmerSize() int {
|
||||
return _kmerSize
|
||||
}
|
||||
|
||||
// CLIMinimizerSize returns the effective minimizer size.
|
||||
// If -1 (auto), computes ceil(k / 2.5) then applies constraints:
|
||||
// - minimum: ceil(log(nworkers) / log(4))
|
||||
// - maximum: k - 1
|
||||
func CLIMinimizerSize() int {
|
||||
m := _minimizerSize
|
||||
if m < 0 {
|
||||
m = obikmer.DefaultMinimizerSize(_kmerSize)
|
||||
}
|
||||
nworkers := obidefault.ParallelWorkers()
|
||||
m = obikmer.ValidateMinimizerSize(m, _kmerSize, nworkers)
|
||||
return m
|
||||
}
|
||||
|
||||
// CLIIndexId returns the index identifier.
|
||||
func CLIIndexId() string {
|
||||
return _indexId
|
||||
}
|
||||
|
||||
// CLIMetadataFormat returns the metadata format.
|
||||
func CLIMetadataFormat() obikmer.MetadataFormat {
|
||||
switch strings.ToLower(_metadataFormat) {
|
||||
case "toml":
|
||||
return obikmer.FormatTOML
|
||||
case "yaml":
|
||||
return obikmer.FormatYAML
|
||||
case "json":
|
||||
return obikmer.FormatJSON
|
||||
default:
|
||||
log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat)
|
||||
return obikmer.FormatTOML
|
||||
}
|
||||
}
|
||||
|
||||
// CLISetTag returns the metadata key=value pairs.
|
||||
func CLISetTag() map[string]string {
|
||||
return _setTag
|
||||
}
|
||||
|
||||
// CLIMinOccurrence returns the minimum occurrence threshold.
|
||||
func CLIMinOccurrence() int {
|
||||
return _minOccurrence
|
||||
}
|
||||
|
||||
// CLISaveFullFilter returns whether to save the full frequency filter.
|
||||
func CLISaveFullFilter() bool {
|
||||
return _saveFullFilter
|
||||
}
|
||||
|
||||
// CLIOutputDirectory returns the output directory path.
|
||||
func CLIOutputDirectory() string {
|
||||
return obiconvert.CLIOutPutFileName()
|
||||
}
|
||||
|
||||
// SetKmerSize sets the k-mer size (for testing).
|
||||
func SetKmerSize(k int) {
|
||||
_kmerSize = k
|
||||
}
|
||||
|
||||
// SetMinimizerSize sets the minimizer size (for testing).
|
||||
func SetMinimizerSize(m int) {
|
||||
_minimizerSize = m
|
||||
}
|
||||
|
||||
// SetMinOccurrence sets the minimum occurrence (for testing).
|
||||
func SetMinOccurrence(n int) {
|
||||
_minOccurrence = n
|
||||
}
|
||||
Reference in New Issue
Block a user