Files
obitools4/pkg/obikmer/kmer_set_group_quorum.go

236 lines
6.1 KiB
Go
Raw Normal View History

package obikmer
import (
"container/heap"
"github.com/RoaringBitmap/roaring/roaring64"
)
// heapItem represents an element in the min-heap for k-way merge
type heapItem struct {
value uint64
idx int
}
// kmerMinHeap implements heap.Interface for k-way merge algorithm
type kmerMinHeap []heapItem
func (h kmerMinHeap) Len() int { return len(h) }
func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value }
func (h kmerMinHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *kmerMinHeap) Push(x interface{}) {
*h = append(*h, x.(heapItem))
}
func (h *kmerMinHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
// QuorumAtLeast returns k-mers present in at least q sets
//
// Algorithm: K-way merge with min-heap counting
//
// The algorithm processes all k-mers in sorted order using a min-heap:
//
// 1. Initialize one iterator per non-empty set
// 2. Build a min-heap of (value, set_index) pairs, one per iterator
// 3. While heap is not empty:
// a. Extract the minimum value v from heap
// b. Pop ALL heap items with value == v (counting occurrences)
// c. If count >= q, add v to result
// d. Advance each popped iterator and re-insert into heap if valid
//
// This ensures each unique k-mer is counted exactly once across all sets.
//
// Time complexity: O(M log N)
// - M = sum of all set cardinalities (total k-mer occurrences)
// - N = number of sets
// - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations
// - Each heap operation costs O(log N)
//
// Space complexity: O(N)
// - Heap contains at most N elements (one per set iterator)
// - Output bitmap size depends on quorum result
//
// Special cases (optimized):
// - q <= 0: returns empty set
// - q == 1: delegates to Union() (native OR operations)
// - q == n: delegates to Intersect() (native AND operations)
// - q > n: returns empty set (impossible to satisfy)
func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet {
n := len(ksg.sets)
// Edge cases
if q <= 0 || n == 0 {
return NewKmerSet(ksg.k)
}
if q > n {
return NewKmerSet(ksg.k)
}
if q == 1 {
return ksg.Union()
}
if q == n {
return ksg.Intersect()
}
// Initialize iterators for all non-empty sets
iterators := make([]roaring64.IntIterable64, 0, n)
iterIndices := make([]int, 0, n)
for i, set := range ksg.sets {
if set.Len() > 0 {
iter := set.bitmap.Iterator()
if iter.HasNext() {
iterators = append(iterators, iter)
iterIndices = append(iterIndices, i)
}
}
}
if len(iterators) == 0 {
return NewKmerSet(ksg.k)
}
// Initialize heap with first value from each iterator
h := make(kmerMinHeap, len(iterators))
for i, iter := range iterators {
h[i] = heapItem{value: iter.Next(), idx: i}
}
heap.Init(&h)
// Result bitmap
result := roaring64.New()
// K-way merge with counting
for len(h) > 0 {
minVal := h[0].value
count := 0
activeIndices := make([]int, 0, len(h))
// Pop all elements with same value (count occurrences)
for len(h) > 0 && h[0].value == minVal {
item := heap.Pop(&h).(heapItem)
count++
activeIndices = append(activeIndices, item.idx)
}
// Add to result if quorum reached
if count >= q {
result.Add(minVal)
}
// Advance iterators and re-insert into heap
for _, iterIdx := range activeIndices {
if iterators[iterIdx].HasNext() {
heap.Push(&h, heapItem{
value: iterators[iterIdx].Next(),
idx: iterIdx,
})
}
}
}
return NewKmerSetFromBitmap(ksg.k, result)
}
// QuorumAtMost returns k-mers present in at most q sets
//
// Algorithm: Uses the mathematical identity
// AtMost(q) = Union() - AtLeast(q+1)
//
// Proof:
// - Union() contains all k-mers present in at least 1 set
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
// - Their difference contains only k-mers present in at most q sets
//
// Implementation:
// 1. Compute U = Union()
// 2. Compute A = QuorumAtLeast(q+1)
// 3. Return U - A using bitmap AndNot operation
//
// Time complexity: O(M log N)
// - Union(): O(M) with native OR operations
// - QuorumAtLeast(q+1): O(M log N)
// - AndNot: O(|U|) where |U| <= M
// - Total: O(M log N)
//
// Space complexity: O(N)
// - Inherited from QuorumAtLeast heap
//
// Special cases:
// - q <= 0: returns empty set
// - q >= n: returns Union() (all k-mers are in at most n sets)
func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet {
n := len(ksg.sets)
// Edge cases
if q <= 0 {
return NewKmerSet(ksg.k)
}
if q >= n {
return ksg.Union()
}
// Compute Union() - AtLeast(q+1)
union := ksg.Union()
atLeastQ1 := ksg.QuorumAtLeast(q + 1)
// Difference: elements in union but not in atLeastQ1
result := union.bitmap.Clone()
result.AndNot(atLeastQ1.bitmap)
return NewKmerSetFromBitmap(ksg.k, result)
}
// QuorumExactly returns k-mers present in exactly q sets
//
// Algorithm: Uses the mathematical identity
// Exactly(q) = AtLeast(q) - AtLeast(q+1)
//
// Proof:
// - AtLeast(q) contains all k-mers present in q or more sets
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
// - Their difference contains only k-mers present in exactly q sets
//
// Implementation:
// 1. Compute A = QuorumAtLeast(q)
// 2. Compute B = QuorumAtLeast(q+1)
// 3. Return A - B using bitmap AndNot operation
//
// Time complexity: O(M log N)
// - Two calls to QuorumAtLeast: 2 * O(M log N)
// - One AndNot operation: O(|A|) where |A| <= M
// - Total: O(M log N) since AndNot is dominated by merge operations
//
// Space complexity: O(N)
// - Inherited from QuorumAtLeast heap
// - Two temporary bitmaps for intermediate results
//
// Special cases:
// - q <= 0: returns empty set
// - q > n: returns empty set (impossible to have k-mer in more than n sets)
func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet {
n := len(ksg.sets)
// Edge cases
if q <= 0 || q > n {
return NewKmerSet(ksg.k)
}
// Compute AtLeast(q) - AtLeast(q+1)
aq := ksg.QuorumAtLeast(q)
aq1 := ksg.QuorumAtLeast(q + 1)
// Difference: elements in aq but not in aq1
result := aq.bitmap.Clone()
result.AndNot(aq1.bitmap)
return NewKmerSetFromBitmap(ksg.k, result)
}