mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update CLI to use new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce KDI (K-mer Delta Index) file format for efficient storage
- Add K-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect new API

This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
86
pkg/obikmer/kdi_merge.go
Normal file
86
pkg/obikmer/kdi_merge.go
Normal file
@@ -0,0 +1,86 @@
|
||||
package obikmer
|
||||
|
||||
import "container/heap"
|
||||
|
||||
// mergeItem is one entry of the min-heap used by the k-way merge: a value
// read from an input stream, paired with the position of that stream in the
// reader slice so the same stream can be advanced once the value has been
// consumed.
type mergeItem struct {
	value uint64 // value read from the stream; the heap is ordered on this field
	idx   int    // index of the reader that produced value
}
|
||||
|
||||
// mergeHeap implements heap.Interface for k-way merge.
|
||||
type mergeHeap []mergeItem
|
||||
|
||||
func (h mergeHeap) Len() int { return len(h) }
|
||||
func (h mergeHeap) Less(i, j int) bool { return h[i].value < h[j].value }
|
||||
func (h mergeHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
|
||||
func (h *mergeHeap) Push(x interface{}) { *h = append(*h, x.(mergeItem)) }
|
||||
func (h *mergeHeap) Pop() interface{} {
|
||||
old := *h
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
*h = old[:n-1]
|
||||
return x
|
||||
}
|
||||
|
||||
// KWayMerge performs a k-way merge of multiple sorted KdiReader streams.
|
||||
// For each unique k-mer value, it reports the value and the number of
|
||||
// input streams that contained it (count).
|
||||
type KWayMerge struct {
|
||||
h mergeHeap
|
||||
readers []*KdiReader
|
||||
}
|
||||
|
||||
// NewKWayMerge creates a k-way merge from multiple KdiReaders.
|
||||
// Each reader must produce values in sorted (ascending) order.
|
||||
func NewKWayMerge(readers []*KdiReader) *KWayMerge {
|
||||
m := &KWayMerge{
|
||||
h: make(mergeHeap, 0, len(readers)),
|
||||
readers: readers,
|
||||
}
|
||||
|
||||
// Initialize heap with first value from each reader
|
||||
for i, r := range readers {
|
||||
if v, ok := r.Next(); ok {
|
||||
m.h = append(m.h, mergeItem{value: v, idx: i})
|
||||
}
|
||||
}
|
||||
heap.Init(&m.h)
|
||||
|
||||
return m
|
||||
}
|
||||
|
||||
// Next returns the next smallest k-mer value, the number of readers
|
||||
// that contained this value (count), and true.
|
||||
// Returns (0, 0, false) when all streams are exhausted.
|
||||
func (m *KWayMerge) Next() (kmer uint64, count int, ok bool) {
|
||||
if len(m.h) == 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
minVal := m.h[0].value
|
||||
count = 0
|
||||
|
||||
// Pop all items with the same value
|
||||
for len(m.h) > 0 && m.h[0].value == minVal {
|
||||
item := heap.Pop(&m.h).(mergeItem)
|
||||
count++
|
||||
// Advance that reader
|
||||
if v, ok := m.readers[item.idx].Next(); ok {
|
||||
heap.Push(&m.h, mergeItem{value: v, idx: item.idx})
|
||||
}
|
||||
}
|
||||
|
||||
return minVal, count, true
|
||||
}
|
||||
|
||||
// Close closes all underlying readers.
|
||||
func (m *KWayMerge) Close() error {
|
||||
var firstErr error
|
||||
for _, r := range m.readers {
|
||||
if err := r.Close(); err != nil && firstErr == nil {
|
||||
firstErr = err
|
||||
}
|
||||
}
|
||||
return firstErr
|
||||
}
|
||||
Reference in New Issue
Block a user