Refactor kmer index to disk-based partitioning with minimizer

Refactor kmer index package to use disk-based partitioning with minimizer - Replace roaring64 bitmaps with disk-based kmer index - Implement partitioned kmer sets with delta-varint encoding - Add support for frequency filtering during construction - Introduce new builder pattern for index construction - Add streaming operations for set operations (union, intersect, etc.) - Add support for super-kmer encoding during construction - Update command line tool to use new index format - Remove dependency on roaring bitmap library This change introduces a new architecture for kmer indexing that is more memory efficient and scalable for large datasets.
2026-05-09 23:50:38 +00:00 · 2026-02-09 17:50:33 +01:00
parent 09d437d10f
commit a016ad5b8a
8 changed files with 987 additions and 0 deletions
@@ -174,6 +174,13 @@ func (ff *FrequencyFilter) AddSequences(sequences *obiseq.BioSequenceSlice) {
 	}
 }

+// AddSequenceSlice adds all k-mers from a slice of sequences to the filter
+func (ff *FrequencyFilter) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
+	for _, seq := range *sequences {
+		ff.AddSequence(seq)
+	}
+}
+
 // ==================================
 // PERSISTANCE
 // ==================================
@@ -0,0 +1,204 @@
+package obikmer
+
+import (
+	"math"
+	"sync"
+
+	log "github.com/sirupsen/logrus"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+)
+
+// DefaultMinimizerSize returns ceil(k / 2.5) as a reasonable default minimizer size.
+func DefaultMinimizerSize(k int) int {
+	m := int(math.Ceil(float64(k) / 2.5))
+	if m < 1 {
+		m = 1
+	}
+	if m >= k {
+		m = k - 1
+	}
+	return m
+}
+
+// MinMinimizerSize returns the minimum m such that 4^m >= nworkers,
+// i.e. ceil(log(nworkers) / log(4)).
+func MinMinimizerSize(nworkers int) int {
+	if nworkers <= 1 {
+		return 1
+	}
+	return int(math.Ceil(math.Log(float64(nworkers)) / math.Log(4)))
+}
+
+// ValidateMinimizerSize checks and adjusts the minimizer size to satisfy constraints:
+// - m >= ceil(log(nworkers)/log(4))
+// - 1 <= m < k
+func ValidateMinimizerSize(m, k, nworkers int) int {
+	minM := MinMinimizerSize(nworkers)
+	if m < minM {
+		log.Warnf("Minimizer size %d too small for %d workers (4^%d = %d < %d), adjusting to %d",
+			m, nworkers, m, 1<<(2*m), nworkers, minM)
+		m = minM
+	}
+	if m < 1 {
+		m = 1
+	}
+	if m >= k {
+		m = k - 1
+	}
+	return m
+}
+
+// BuildKmerIndex builds a KmerSet from an iterator using parallel super-kmer partitioning.
+//
+// The algorithm:
+//  1. Extract super-kmers from each sequence using IterSuperKmers
+//  2. Route each super-kmer to a worker based on minimizer % nworkers
+//  3. Each worker extracts canonical k-mers and adds them to its local KmerSet
+//  4. Merge all KmerSets via Union
+//
+// Parameters:
+//   - iterator: source of BioSequence batches
+//   - k: k-mer size (1-31)
+//   - m: minimizer size (1 to k-1)
+func BuildKmerIndex(iterator obiiter.IBioSequence, k, m int) *KmerSet {
+	nproc := obidefault.ParallelWorkers()
+	m = ValidateMinimizerSize(m, k, nproc)
+
+	// Channels to route super-kmers to workers
+	channels := make([]chan SuperKmer, nproc)
+	for i := range channels {
+		channels[i] = make(chan SuperKmer, 1024)
+	}
+
+	// Workers: each manages a partition of the minimizer space
+	sets := make([]*KmerSet, nproc)
+	waiter := sync.WaitGroup{}
+	waiter.Add(nproc)
+	for i := 0; i < nproc; i++ {
+		sets[i] = NewKmerSet(k)
+		go func(ch chan SuperKmer, ks *KmerSet) {
+			defer waiter.Done()
+			for sk := range ch {
+				for kmer := range IterCanonicalKmers(sk.Sequence, k) {
+					ks.AddKmerCode(kmer)
+				}
+			}
+		}(channels[i], sets[i])
+	}
+
+	// Reader: extract super-kmers and route them
+	seqCount := 0
+	for iterator.Next() {
+		batch := iterator.Get()
+		for _, seq := range batch.Slice() {
+			rawSeq := seq.Sequence()
+			if len(rawSeq) < k {
+				continue
+			}
+			for sk := range IterSuperKmers(rawSeq, k, m) {
+				worker := int(sk.Minimizer % uint64(nproc))
+				channels[worker] <- sk
+			}
+			seqCount++
+		}
+	}
+
+	// Close channels to signal workers to finish
+	for _, ch := range channels {
+		close(ch)
+	}
+	waiter.Wait()
+
+	log.Infof("Processed %d sequences", seqCount)
+
+	// Merge partitions (mostly disjoint -> fast union)
+	result := sets[0]
+	for i := 1; i < nproc; i++ {
+		result.bitmap.Or(sets[i].bitmap)
+	}
+
+	log.Infof("Index contains %d k-mers (%.2f MB)",
+		result.Len(), float64(result.MemoryUsage())/1024/1024)
+
+	return result
+}
+
+// BuildFrequencyFilterIndex builds a FrequencyFilter from an iterator
+// using parallel super-kmer partitioning.
+//
+// Each worker manages its own FrequencyFilter for its partition of the
+// minimizer space. Since all k-mers sharing a minimizer go to the same worker,
+// the frequency counting is correct per partition.
+//
+// Parameters:
+//   - iterator: source of BioSequence batches
+//   - k: k-mer size (1-31)
+//   - m: minimizer size (1 to k-1)
+//   - minFreq: minimum frequency threshold (>= 1)
+func BuildFrequencyFilterIndex(iterator obiiter.IBioSequence, k, m, minFreq int) *FrequencyFilter {
+	nproc := obidefault.ParallelWorkers()
+	m = ValidateMinimizerSize(m, k, nproc)
+
+	// Channels to route super-kmers to workers
+	channels := make([]chan SuperKmer, nproc)
+	for i := range channels {
+		channels[i] = make(chan SuperKmer, 1024)
+	}
+
+	// Workers: each manages a local FrequencyFilter
+	filters := make([]*FrequencyFilter, nproc)
+	waiter := sync.WaitGroup{}
+	waiter.Add(nproc)
+	for i := 0; i < nproc; i++ {
+		filters[i] = NewFrequencyFilter(k, minFreq)
+		go func(ch chan SuperKmer, ff *FrequencyFilter) {
+			defer waiter.Done()
+			for sk := range ch {
+				for kmer := range IterCanonicalKmers(sk.Sequence, k) {
+					ff.AddKmerCode(kmer)
+				}
+			}
+		}(channels[i], filters[i])
+	}
+
+	// Reader: extract super-kmers and route them
+	seqCount := 0
+	for iterator.Next() {
+		batch := iterator.Get()
+		for _, seq := range batch.Slice() {
+			rawSeq := seq.Sequence()
+			if len(rawSeq) < k {
+				continue
+			}
+			for sk := range IterSuperKmers(rawSeq, k, m) {
+				worker := int(sk.Minimizer % uint64(nproc))
+				channels[worker] <- sk
+			}
+			seqCount++
+		}
+	}
+
+	// Close channels to signal workers to finish
+	for _, ch := range channels {
+		close(ch)
+	}
+	waiter.Wait()
+
+	log.Infof("Processed %d sequences", seqCount)
+
+	// Merge FrequencyFilters: union level by level
+	result := filters[0]
+	for i := 1; i < nproc; i++ {
+		for level := 0; level < minFreq; level++ {
+			result.Get(level).bitmap.Or(filters[i].Get(level).bitmap)
+		}
+	}
+
+	stats := result.Stats()
+	log.Infof("FrequencyFilter: %d k-mers with freq >= %d (%.2f MB total)",
+		stats.FilteredKmers, minFreq, float64(stats.TotalBytes)/1024/1024)
+
+	return result
+}
@@ -82,6 +82,13 @@ func (ks *KmerSet) AddSequences(sequences *obiseq.BioSequenceSlice) {
 	}
 }

+// AddSequenceSlice adds all k-mers from a slice of sequences
+func (ks *KmerSet) AddSequenceSlice(sequences *obiseq.BioSequenceSlice) {
+	for _, seq := range *sequences {
+		ks.AddSequence(seq)
+	}
+}
+
 // Contains checks if a k-mer is in the set
 func (ks *KmerSet) Contains(kmer uint64) bool {
 	return ks.bitmap.Contains(kmer)
@@ -145,6 +145,14 @@ func (ksg *KmerSetGroup) AddSequences(sequences *obiseq.BioSequenceSlice, index
 	ksg.sets[index].AddSequences(sequences)
 }

+// AddSequenceSlice adds all k-mers from a slice of sequences to a specific KmerSet
+func (ksg *KmerSetGroup) AddSequenceSlice(sequences *obiseq.BioSequenceSlice, index int) {
+	if index < 0 || index >= len(ksg.sets) {
+		panic(fmt.Sprintf("Index out of bounds: %d (size: %d)", index, len(ksg.sets)))
+	}
+	ksg.sets[index].AddSequenceSlice(sequences)
+}
+
 // Union returns the union of all KmerSet in the group
 // Optimization: starts from the largest set to minimize operations
 func (ksg *KmerSetGroup) Union() *KmerSet {