Refactor k-mer normalization functions and add quorum operations

This commit refactors the k-mer normalization functions, renaming them from 'NormalizeKmer' to 'CanonicalKmer' to better reflect their purpose of returning canonical k-mers. It also introduces new quorum operations (AtLeast, AtMost, Exactly) for k-mer set groups, along with comprehensive tests and benchmarks. The version commit hash has also been updated.
This commit is contained in:
Eric Coissac
2026-02-05 17:11:14 +01:00
parent a43e6258be
commit aa2e94dd6f
4 changed files with 671 additions and 41 deletions

View File

@@ -352,8 +352,8 @@ func TestReverseComplementInvolution(t *testing.T) {
} }
} }
// TestNormalizeKmer tests the normalization function // TestCanonicalKmer tests the normalization function
func TestNormalizeKmer(t *testing.T) { func TestCanonicalKmer(t *testing.T) {
tests := []struct { tests := []struct {
name string name string
seq string seq string
@@ -374,7 +374,7 @@ func TestNormalizeKmer(t *testing.T) {
kmer := kmers[0] kmer := kmers[0]
rc := ReverseComplement(kmer, tt.k) rc := ReverseComplement(kmer, tt.k)
normalized := NormalizeKmer(kmer, tt.k) normalized := CanonicalKmer(kmer, tt.k)
// Normalized should be the minimum // Normalized should be the minimum
expectedNorm := kmer expectedNorm := kmer
@@ -383,27 +383,27 @@ func TestNormalizeKmer(t *testing.T) {
} }
if normalized != expectedNorm { if normalized != expectedNorm {
t.Errorf("NormalizeKmer(%d) = %d, want %d", kmer, normalized, expectedNorm) t.Errorf("CanonicalKmer(%d) = %d, want %d", kmer, normalized, expectedNorm)
} }
// Normalizing the RC should give the same result // Normalizing the RC should give the same result
normalizedRC := NormalizeKmer(rc, tt.k) normalizedRC := CanonicalKmer(rc, tt.k)
if normalizedRC != normalized { if normalizedRC != normalized {
t.Errorf("NormalizeKmer(RC) = %d, want %d (same as NormalizeKmer(fwd))", normalizedRC, normalized) t.Errorf("CanonicalKmer(RC) = %d, want %d (same as CanonicalKmer(fwd))", normalizedRC, normalized)
} }
}) })
} }
} }
// TestEncodeNormalizedKmersBasic tests basic normalized k-mer encoding // TestEncodeCanonicalKmersBasic tests basic normalized k-mer encoding
func TestEncodeNormalizedKmersBasic(t *testing.T) { func TestEncodeCanonicalKmersBasic(t *testing.T) {
// Test that a sequence and its reverse complement produce the same normalized k-mers // Test that a sequence and its reverse complement produce the same normalized k-mers
seq := []byte("AACGTT") seq := []byte("AACGTT")
revComp := []byte("AACGTT") // This is a palindrome! revComp := []byte("AACGTT") // This is a palindrome!
k := 4 k := 4
kmers1 := EncodeNormalizedKmers(seq, k, nil) kmers1 := EncodeCanonicalKmers(seq, k, nil)
kmers2 := EncodeNormalizedKmers(revComp, k, nil) kmers2 := EncodeCanonicalKmers(revComp, k, nil)
if len(kmers1) != len(kmers2) { if len(kmers1) != len(kmers2) {
t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2)) t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2))
@@ -417,8 +417,8 @@ func TestEncodeNormalizedKmersBasic(t *testing.T) {
} }
} }
// TestEncodeNormalizedKmersSymmetry tests that seq and its RC produce same normalized k-mers (reversed) // TestEncodeCanonicalKmersSymmetry tests that seq and its RC produce same normalized k-mers (reversed)
func TestEncodeNormalizedKmersSymmetry(t *testing.T) { func TestEncodeCanonicalKmersSymmetry(t *testing.T) {
// Manually construct a sequence and its reverse complement // Manually construct a sequence and its reverse complement
seq := []byte("ACGTAACCGG") seq := []byte("ACGTAACCGG")
@@ -430,8 +430,8 @@ func TestEncodeNormalizedKmersSymmetry(t *testing.T) {
} }
k := 4 k := 4
kmers1 := EncodeNormalizedKmers(seq, k, nil) kmers1 := EncodeCanonicalKmers(seq, k, nil)
kmers2 := EncodeNormalizedKmers(revComp, k, nil) kmers2 := EncodeCanonicalKmers(revComp, k, nil)
if len(kmers1) != len(kmers2) { if len(kmers1) != len(kmers2) {
t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2)) t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2))
@@ -446,14 +446,14 @@ func TestEncodeNormalizedKmersSymmetry(t *testing.T) {
} }
} }
// TestEncodeNormalizedKmersConsistency verifies normalized k-mers match manual normalization // TestEncodeCanonicalKmersConsistency verifies normalized k-mers match manual normalization
func TestEncodeNormalizedKmersConsistency(t *testing.T) { func TestEncodeCanonicalKmersConsistency(t *testing.T) {
seq := []byte("ACGTACGTACGTACGT") seq := []byte("ACGTACGTACGTACGT")
k := 8 k := 8
// Get k-mers both ways // Get k-mers both ways
rawKmers := EncodeKmers(seq, k, nil) rawKmers := EncodeKmers(seq, k, nil)
normalizedKmers := EncodeNormalizedKmers(seq, k, nil) normalizedKmers := EncodeCanonicalKmers(seq, k, nil)
if len(rawKmers) != len(normalizedKmers) { if len(rawKmers) != len(normalizedKmers) {
t.Fatalf("length mismatch: %d vs %d", len(rawKmers), len(normalizedKmers)) t.Fatalf("length mismatch: %d vs %d", len(rawKmers), len(normalizedKmers))
@@ -461,16 +461,16 @@ func TestEncodeNormalizedKmersConsistency(t *testing.T) {
// Verify each normalized k-mer matches manual normalization // Verify each normalized k-mer matches manual normalization
for i, raw := range rawKmers { for i, raw := range rawKmers {
expected := NormalizeKmer(raw, k) expected := CanonicalKmer(raw, k)
if normalizedKmers[i] != expected { if normalizedKmers[i] != expected {
t.Errorf("position %d: EncodeNormalizedKmers gave %d, NormalizeKmer gave %d", t.Errorf("position %d: EncodeCanonicalKmers gave %d, CanonicalKmer gave %d",
i, normalizedKmers[i], expected) i, normalizedKmers[i], expected)
} }
} }
} }
// BenchmarkEncodeNormalizedKmers benchmarks the normalized encoding function // BenchmarkEncodeCanonicalKmers benchmarks the normalized encoding function
func BenchmarkEncodeNormalizedKmers(b *testing.B) { func BenchmarkEncodeCanonicalKmers(b *testing.B) {
sizes := []int{100, 1000, 10000, 100000} sizes := []int{100, 1000, 10000, 100000}
kSizes := []int{8, 16, 31} kSizes := []int{8, 16, 31}
@@ -488,7 +488,7 @@ func BenchmarkEncodeNormalizedKmers(b *testing.B) {
b.SetBytes(int64(size)) b.SetBytes(int64(size))
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
EncodeNormalizedKmers(seq, k, &buffer) EncodeCanonicalKmers(seq, k, &buffer)
} }
}) })
} }
@@ -506,14 +506,14 @@ func BenchmarkReverseComplement(b *testing.B) {
} }
} }
// BenchmarkNormalizeKmer benchmarks the normalization function // BenchmarkCanonicalKmer benchmarks the normalization function
func BenchmarkNormalizeKmer(b *testing.B) { func BenchmarkCanonicalKmer(b *testing.B) {
kmer := uint64(0x06C6C6C6C6C6C6C6) kmer := uint64(0x06C6C6C6C6C6C6C6)
k := 31 k := 31
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
NormalizeKmer(kmer, k) CanonicalKmer(kmer, k)
} }
} }
@@ -730,7 +730,7 @@ func TestExtractSuperKmersCanonical(t *testing.T) {
for i, sk := range result { for i, sk := range result {
// Verify the minimizer is indeed canonical (equal to its normalized form) // Verify the minimizer is indeed canonical (equal to its normalized form)
normalized := NormalizeKmer(sk.Minimizer, m) normalized := CanonicalKmer(sk.Minimizer, m)
if sk.Minimizer != normalized { if sk.Minimizer != normalized {
t.Errorf("super k-mer %d: minimizer %d is not canonical (normalized: %d)", t.Errorf("super k-mer %d: minimizer %d is not canonical (normalized: %d)",
i, sk.Minimizer, normalized) i, sk.Minimizer, normalized)
@@ -886,8 +886,8 @@ func TestKmerErrorMarkersWithRealKmers(t *testing.T) {
} }
// Verify normalization works with error bits cleared // Verify normalization works with error bits cleared
normalized1 := NormalizeKmer(originalKmer, k) normalized1 := CanonicalKmer(originalKmer, k)
normalized2 := NormalizeKmer(ClearKmerError(marked), k) normalized2 := CanonicalKmer(ClearKmerError(marked), k)
if normalized1 != normalized2 { if normalized1 != normalized2 {
t.Errorf("Normalization affected by error bits") t.Errorf("Normalization affected by error bits")
} }
@@ -977,8 +977,8 @@ func TestReverseComplementPreservesErrorBits(t *testing.T) {
} }
} }
// TestNormalizeKmerWithErrorBits tests that NormalizeKmer works with error bits // TestCanonicalKmerWithErrorBits tests that CanonicalKmer works with error bits
func TestNormalizeKmerWithErrorBits(t *testing.T) { func TestCanonicalKmerWithErrorBits(t *testing.T) {
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
k := 31 k := 31
@@ -995,7 +995,7 @@ func TestNormalizeKmerWithErrorBits(t *testing.T) {
marked := SetKmerError(originalKmer, errCode) marked := SetKmerError(originalKmer, errCode)
// Normalize should work on the sequence part // Normalize should work on the sequence part
normalized := NormalizeKmer(marked, k) normalized := CanonicalKmer(marked, k)
// Error bits should be preserved // Error bits should be preserved
if GetKmerError(normalized) != errCode { if GetKmerError(normalized) != errCode {
@@ -1004,7 +1004,7 @@ func TestNormalizeKmerWithErrorBits(t *testing.T) {
// The sequence part should be normalized // The sequence part should be normalized
cleanNormalized := ClearKmerError(normalized) cleanNormalized := ClearKmerError(normalized)
expectedNormalized := NormalizeKmer(ClearKmerError(marked), k) expectedNormalized := CanonicalKmer(ClearKmerError(marked), k)
if cleanNormalized != expectedNormalized { if cleanNormalized != expectedNormalized {
t.Errorf("Normalization incorrect with error bits present") t.Errorf("Normalization incorrect with error bits present")
@@ -1081,19 +1081,19 @@ func TestIterKmers(t *testing.T) {
} }
} }
// TestIterNormalizedKmers tests the normalized k-mer iterator // TestIterCanonicalKmers tests the normalized k-mer iterator
func TestIterNormalizedKmers(t *testing.T) { func TestIterCanonicalKmers(t *testing.T) {
seq := []byte("ACGTACGTACGT") seq := []byte("ACGTACGTACGT")
k := 6 k := 6
// Collect k-mers via iterator // Collect k-mers via iterator
var iterKmers []uint64 var iterKmers []uint64
for kmer := range IterNormalizedKmers(seq, k) { for kmer := range IterCanonicalKmers(seq, k) {
iterKmers = append(iterKmers, kmer) iterKmers = append(iterKmers, kmer)
} }
// Compare with slice-based version // Compare with slice-based version
sliceKmers := EncodeNormalizedKmers(seq, k, nil) sliceKmers := EncodeCanonicalKmers(seq, k, nil)
if len(iterKmers) != len(sliceKmers) { if len(iterKmers) != len(sliceKmers) {
t.Errorf("length mismatch: iter=%d, slice=%d", len(iterKmers), len(sliceKmers)) t.Errorf("length mismatch: iter=%d, slice=%d", len(iterKmers), len(sliceKmers))
@@ -1151,8 +1151,8 @@ func BenchmarkIterKmers(b *testing.B) {
}) })
} }
// BenchmarkIterNormalizedKmers benchmarks the normalized iterator // BenchmarkIterCanonicalKmers benchmarks the normalized iterator
func BenchmarkIterNormalizedKmers(b *testing.B) { func BenchmarkIterCanonicalKmers(b *testing.B) {
seq := make([]byte, 10000) seq := make([]byte, 10000)
for i := range seq { for i := range seq {
seq[i] = "ACGT"[i%4] seq[i] = "ACGT"[i%4]
@@ -1163,7 +1163,7 @@ func BenchmarkIterNormalizedKmers(b *testing.B) {
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
count := 0 count := 0
for range IterNormalizedKmers(seq, k) { for range IterCanonicalKmers(seq, k) {
count++ count++
} }
} }
@@ -1173,7 +1173,7 @@ func BenchmarkIterNormalizedKmers(b *testing.B) {
var buffer []uint64 var buffer []uint64
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
buffer = EncodeNormalizedKmers(seq, k, &buffer) buffer = EncodeCanonicalKmers(seq, k, &buffer)
} }
}) })
} }

View File

@@ -0,0 +1,235 @@
package obikmer
import (
"container/heap"
"github.com/RoaringBitmap/roaring/roaring64"
)
// heapItem represents an element in the min-heap for k-way merge
type heapItem struct {
value uint64
idx int
}
// kmerMinHeap implements heap.Interface for k-way merge algorithm
type kmerMinHeap []heapItem
func (h kmerMinHeap) Len() int { return len(h) }
func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value }
func (h kmerMinHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
func (h *kmerMinHeap) Push(x interface{}) {
*h = append(*h, x.(heapItem))
}
func (h *kmerMinHeap) Pop() interface{} {
old := *h
n := len(old)
x := old[n-1]
*h = old[0 : n-1]
return x
}
// QuorumAtLeast returns k-mers present in at least q sets
//
// Algorithm: K-way merge with min-heap counting
//
// The algorithm processes all k-mers in sorted order using a min-heap:
//
// 1. Initialize one iterator per non-empty set
// 2. Build a min-heap of (value, set_index) pairs, one per iterator
// 3. While heap is not empty:
// a. Extract the minimum value v from heap
// b. Pop ALL heap items with value == v (counting occurrences)
// c. If count >= q, add v to result
// d. Advance each popped iterator and re-insert into heap if valid
//
// This ensures each unique k-mer is counted exactly once across all sets.
//
// Time complexity: O(M log N)
// - M = sum of all set cardinalities (total k-mer occurrences)
// - N = number of sets
// - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations
// - Each heap operation costs O(log N)
//
// Space complexity: O(N)
// - Heap contains at most N elements (one per set iterator)
// - Output bitmap size depends on quorum result
//
// Special cases (optimized):
// - q <= 0: returns empty set
// - q == 1: delegates to Union() (native OR operations)
// - q == n: delegates to Intersect() (native AND operations)
// - q > n: returns empty set (impossible to satisfy)
func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet {
n := len(ksg.sets)
// Edge cases
if q <= 0 || n == 0 {
return NewKmerSet(ksg.k)
}
if q > n {
return NewKmerSet(ksg.k)
}
if q == 1 {
return ksg.Union()
}
if q == n {
return ksg.Intersect()
}
// Initialize iterators for all non-empty sets
iterators := make([]roaring64.IntIterable64, 0, n)
iterIndices := make([]int, 0, n)
for i, set := range ksg.sets {
if set.Len() > 0 {
iter := set.bitmap.Iterator()
if iter.HasNext() {
iterators = append(iterators, iter)
iterIndices = append(iterIndices, i)
}
}
}
if len(iterators) == 0 {
return NewKmerSet(ksg.k)
}
// Initialize heap with first value from each iterator
h := make(kmerMinHeap, len(iterators))
for i, iter := range iterators {
h[i] = heapItem{value: iter.Next(), idx: i}
}
heap.Init(&h)
// Result bitmap
result := roaring64.New()
// K-way merge with counting
for len(h) > 0 {
minVal := h[0].value
count := 0
activeIndices := make([]int, 0, len(h))
// Pop all elements with same value (count occurrences)
for len(h) > 0 && h[0].value == minVal {
item := heap.Pop(&h).(heapItem)
count++
activeIndices = append(activeIndices, item.idx)
}
// Add to result if quorum reached
if count >= q {
result.Add(minVal)
}
// Advance iterators and re-insert into heap
for _, iterIdx := range activeIndices {
if iterators[iterIdx].HasNext() {
heap.Push(&h, heapItem{
value: iterators[iterIdx].Next(),
idx: iterIdx,
})
}
}
}
return NewKmerSetFromBitmap(ksg.k, result)
}
// QuorumAtMost returns k-mers present in at most q sets
//
// Algorithm: Uses the mathematical identity
// AtMost(q) = Union() - AtLeast(q+1)
//
// Proof:
// - Union() contains all k-mers present in at least 1 set
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
// - Their difference contains only k-mers present in at most q sets
//
// Implementation:
// 1. Compute U = Union()
// 2. Compute A = QuorumAtLeast(q+1)
// 3. Return U - A using bitmap AndNot operation
//
// Time complexity: O(M log N)
// - Union(): O(M) with native OR operations
// - QuorumAtLeast(q+1): O(M log N)
// - AndNot: O(|U|) where |U| <= M
// - Total: O(M log N)
//
// Space complexity: O(N)
// - Inherited from QuorumAtLeast heap
//
// Special cases:
// - q <= 0: returns empty set
// - q >= n: returns Union() (all k-mers are in at most n sets)
func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet {
n := len(ksg.sets)
// Edge cases
if q <= 0 {
return NewKmerSet(ksg.k)
}
if q >= n {
return ksg.Union()
}
// Compute Union() - AtLeast(q+1)
union := ksg.Union()
atLeastQ1 := ksg.QuorumAtLeast(q + 1)
// Difference: elements in union but not in atLeastQ1
result := union.bitmap.Clone()
result.AndNot(atLeastQ1.bitmap)
return NewKmerSetFromBitmap(ksg.k, result)
}
// QuorumExactly returns k-mers present in exactly q sets
//
// Algorithm: Uses the mathematical identity
// Exactly(q) = AtLeast(q) - AtLeast(q+1)
//
// Proof:
// - AtLeast(q) contains all k-mers present in q or more sets
// - AtLeast(q+1) contains all k-mers present in q+1 or more sets
// - Their difference contains only k-mers present in exactly q sets
//
// Implementation:
// 1. Compute A = QuorumAtLeast(q)
// 2. Compute B = QuorumAtLeast(q+1)
// 3. Return A - B using bitmap AndNot operation
//
// Time complexity: O(M log N)
// - Two calls to QuorumAtLeast: 2 * O(M log N)
// - One AndNot operation: O(|A|) where |A| <= M
// - Total: O(M log N) since AndNot is dominated by merge operations
//
// Space complexity: O(N)
// - Inherited from QuorumAtLeast heap
// - Two temporary bitmaps for intermediate results
//
// Special cases:
// - q <= 0: returns empty set
// - q > n: returns empty set (impossible to have k-mer in more than n sets)
func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet {
n := len(ksg.sets)
// Edge cases
if q <= 0 || q > n {
return NewKmerSet(ksg.k)
}
// Compute AtLeast(q) - AtLeast(q+1)
aq := ksg.QuorumAtLeast(q)
aq1 := ksg.QuorumAtLeast(q + 1)
// Difference: elements in aq but not in aq1
result := aq.bitmap.Clone()
result.AndNot(aq1.bitmap)
return NewKmerSetFromBitmap(ksg.k, result)
}

View File

@@ -0,0 +1,395 @@
package obikmer
import (
"testing"
)
// TestQuorumAtLeastEdgeCases tests edge cases for QuorumAtLeast
func TestQuorumAtLeastEdgeCases(t *testing.T) {
k := 5
// Test group with all empty sets
emptyGroup := NewKmerSetGroup(k, 3)
result := emptyGroup.QuorumAtLeast(1)
if result.Len() != 0 {
t.Errorf("Empty sets: expected 0 k-mers, got %d", result.Len())
}
// Test q <= 0
group := NewKmerSetGroup(k, 3)
result = group.QuorumAtLeast(0)
if result.Len() != 0 {
t.Errorf("q=0: expected 0 k-mers, got %d", result.Len())
}
result = group.QuorumAtLeast(-1)
if result.Len() != 0 {
t.Errorf("q=-1: expected 0 k-mers, got %d", result.Len())
}
// Test q > n
group.Get(0).AddKmerCode(1)
result = group.QuorumAtLeast(10)
if result.Len() != 0 {
t.Errorf("q>n: expected 0 k-mers, got %d", result.Len())
}
}
// TestQuorumAtLeastQ1 tests q=1 (should equal Union)
func TestQuorumAtLeastQ1(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 3)
// Add different k-mers to each set
group.Get(0).AddKmerCode(1)
group.Get(0).AddKmerCode(2)
group.Get(1).AddKmerCode(2)
group.Get(1).AddKmerCode(3)
group.Get(2).AddKmerCode(3)
group.Get(2).AddKmerCode(4)
quorum := group.QuorumAtLeast(1)
union := group.Union()
if quorum.Len() != union.Len() {
t.Errorf("QuorumAtLeast(1) length %d != Union length %d", quorum.Len(), union.Len())
}
// Check all elements match
for kmer := uint64(1); kmer <= 4; kmer++ {
if quorum.Contains(kmer) != union.Contains(kmer) {
t.Errorf("Mismatch for k-mer %d", kmer)
}
}
}
// TestQuorumAtLeastQN tests q=n (should equal Intersect)
func TestQuorumAtLeastQN(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 3)
// Add some common k-mers and some unique
for i := 0; i < 3; i++ {
group.Get(i).AddKmerCode(10) // common to all
group.Get(i).AddKmerCode(20) // common to all
}
group.Get(0).AddKmerCode(1) // unique to set 0
group.Get(1).AddKmerCode(2) // unique to set 1
quorum := group.QuorumAtLeast(3)
intersect := group.Intersect()
if quorum.Len() != intersect.Len() {
t.Errorf("QuorumAtLeast(n) length %d != Intersect length %d", quorum.Len(), intersect.Len())
}
if quorum.Len() != 2 {
t.Errorf("Expected 2 common k-mers, got %d", quorum.Len())
}
if !quorum.Contains(10) || !quorum.Contains(20) {
t.Error("Missing common k-mers")
}
if quorum.Contains(1) || quorum.Contains(2) {
t.Error("Unique k-mers should not be in result")
}
}
// TestQuorumAtLeastGeneral tests general quorum values
func TestQuorumAtLeastGeneral(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 5)
// Setup: k-mer i appears in i sets (for i=1..5)
// k-mer 1: in set 0
// k-mer 2: in sets 0,1
// k-mer 3: in sets 0,1,2
// k-mer 4: in sets 0,1,2,3
// k-mer 5: in sets 0,1,2,3,4 (all)
for kmer := uint64(1); kmer <= 5; kmer++ {
for setIdx := 0; setIdx < int(kmer); setIdx++ {
group.Get(setIdx).AddKmerCode(kmer)
}
}
tests := []struct {
q int
expected map[uint64]bool
}{
{1, map[uint64]bool{1: true, 2: true, 3: true, 4: true, 5: true}},
{2, map[uint64]bool{2: true, 3: true, 4: true, 5: true}},
{3, map[uint64]bool{3: true, 4: true, 5: true}},
{4, map[uint64]bool{4: true, 5: true}},
{5, map[uint64]bool{5: true}},
}
for _, tt := range tests {
result := group.QuorumAtLeast(tt.q)
if result.Len() != uint64(len(tt.expected)) {
t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
}
for kmer := uint64(1); kmer <= 5; kmer++ {
shouldContain := tt.expected[kmer]
doesContain := result.Contains(kmer)
if shouldContain != doesContain {
t.Errorf("q=%d, k-mer=%d: expected contains=%v, got %v", tt.q, kmer, shouldContain, doesContain)
}
}
}
}
// TestQuorumExactlyBasic tests QuorumExactly basic functionality
func TestQuorumExactlyBasic(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 5)
// Setup: k-mer i appears in exactly i sets
for kmer := uint64(1); kmer <= 5; kmer++ {
for setIdx := 0; setIdx < int(kmer); setIdx++ {
group.Get(setIdx).AddKmerCode(kmer)
}
}
tests := []struct {
q int
expected []uint64
}{
{1, []uint64{1}},
{2, []uint64{2}},
{3, []uint64{3}},
{4, []uint64{4}},
{5, []uint64{5}},
}
for _, tt := range tests {
result := group.QuorumExactly(tt.q)
if result.Len() != uint64(len(tt.expected)) {
t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
}
for _, kmer := range tt.expected {
if !result.Contains(kmer) {
t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
}
}
}
}
// TestQuorumIdentity tests the mathematical identity: Exactly(q) = AtLeast(q) - AtLeast(q+1)
func TestQuorumIdentity(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 4)
// Add random distribution
group.Get(0).AddKmerCode(1)
group.Get(0).AddKmerCode(2)
group.Get(0).AddKmerCode(3)
group.Get(1).AddKmerCode(2)
group.Get(1).AddKmerCode(3)
group.Get(1).AddKmerCode(4)
group.Get(2).AddKmerCode(3)
group.Get(2).AddKmerCode(4)
group.Get(3).AddKmerCode(4)
for q := 1; q <= 4; q++ {
exactly := group.QuorumExactly(q)
atLeast := group.QuorumAtLeast(q)
atLeastPlus1 := group.QuorumAtLeast(q + 1)
// Verify: every element in exactly(q) is in atLeast(q)
iter := exactly.Iterator()
for iter.HasNext() {
kmer := iter.Next()
if !atLeast.Contains(kmer) {
t.Errorf("q=%d: k-mer %d in Exactly but not in AtLeast", q, kmer)
}
if atLeastPlus1.Contains(kmer) {
t.Errorf("q=%d: k-mer %d in Exactly but also in AtLeast(q+1)", q, kmer)
}
}
}
}
// TestQuorumDisjointSets tests quorum on completely disjoint sets
func TestQuorumDisjointSets(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 3)
// Each set has unique k-mers
group.Get(0).AddKmerCode(1)
group.Get(1).AddKmerCode(2)
group.Get(2).AddKmerCode(3)
// q=1 should give all
result := group.QuorumAtLeast(1)
if result.Len() != 3 {
t.Errorf("Disjoint sets q=1: expected 3, got %d", result.Len())
}
// q=2 should give none
result = group.QuorumAtLeast(2)
if result.Len() != 0 {
t.Errorf("Disjoint sets q=2: expected 0, got %d", result.Len())
}
}
// TestQuorumIdenticalSets tests quorum on identical sets
func TestQuorumIdenticalSets(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 3)
// All sets have same k-mers
for i := 0; i < 3; i++ {
group.Get(i).AddKmerCode(10)
group.Get(i).AddKmerCode(20)
group.Get(i).AddKmerCode(30)
}
// Any q <= n should give all k-mers
for q := 1; q <= 3; q++ {
result := group.QuorumAtLeast(q)
if result.Len() != 3 {
t.Errorf("Identical sets q=%d: expected 3, got %d", q, result.Len())
}
}
}
// TestQuorumLargeNumbers tests with large k-mer values
func TestQuorumLargeNumbers(t *testing.T) {
k := 21
group := NewKmerSetGroup(k, 3)
// Use large uint64 values (actual k-mer encodings)
largeKmers := []uint64{
0x1234567890ABCDEF,
0xFEDCBA0987654321,
0xAAAAAAAAAAAAAAAA,
}
// Add to multiple sets
for i := 0; i < 3; i++ {
for j := 0; j <= i; j++ {
group.Get(j).AddKmerCode(largeKmers[i])
}
}
result := group.QuorumAtLeast(2)
if result.Len() != 2 {
t.Errorf("Large numbers q=2: expected 2, got %d", result.Len())
}
if !result.Contains(largeKmers[1]) || !result.Contains(largeKmers[2]) {
t.Error("Large numbers: wrong k-mers in result")
}
}
// TestQuorumAtMostBasic tests QuorumAtMost basic functionality
func TestQuorumAtMostBasic(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 5)
// Setup: k-mer i appears in exactly i sets
for kmer := uint64(1); kmer <= 5; kmer++ {
for setIdx := 0; setIdx < int(kmer); setIdx++ {
group.Get(setIdx).AddKmerCode(kmer)
}
}
tests := []struct {
q int
expected []uint64
}{
{0, []uint64{}}, // at most 0: none
{1, []uint64{1}}, // at most 1: only k-mer 1
{2, []uint64{1, 2}}, // at most 2: k-mers 1,2
{3, []uint64{1, 2, 3}}, // at most 3: k-mers 1,2,3
{4, []uint64{1, 2, 3, 4}}, // at most 4: k-mers 1,2,3,4
{5, []uint64{1, 2, 3, 4, 5}}, // at most 5: all k-mers
{10, []uint64{1, 2, 3, 4, 5}}, // at most 10: all k-mers
}
for _, tt := range tests {
result := group.QuorumAtMost(tt.q)
if result.Len() != uint64(len(tt.expected)) {
t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len())
}
for _, kmer := range tt.expected {
if !result.Contains(kmer) {
t.Errorf("q=%d: missing k-mer %d", tt.q, kmer)
}
}
}
}
// TestQuorumComplementIdentity tests that AtLeast and AtMost are complementary
func TestQuorumComplementIdentity(t *testing.T) {
k := 5
group := NewKmerSetGroup(k, 4)
// Add random distribution
group.Get(0).AddKmerCode(1)
group.Get(0).AddKmerCode(2)
group.Get(0).AddKmerCode(3)
group.Get(1).AddKmerCode(2)
group.Get(1).AddKmerCode(3)
group.Get(1).AddKmerCode(4)
group.Get(2).AddKmerCode(3)
group.Get(2).AddKmerCode(4)
group.Get(3).AddKmerCode(4)
union := group.Union()
for q := 1; q < 4; q++ {
atMost := group.QuorumAtMost(q)
atLeast := group.QuorumAtLeast(q + 1)
// Verify: AtMost(q) AtLeast(q+1) = Union()
combined := atMost.Union(atLeast)
if combined.Len() != union.Len() {
t.Errorf("q=%d: AtMost(q) AtLeast(q+1) has %d k-mers, Union has %d",
q, combined.Len(), union.Len())
}
// Verify: AtMost(q) ∩ AtLeast(q+1) = ∅
overlap := atMost.Intersect(atLeast)
if overlap.Len() != 0 {
t.Errorf("q=%d: AtMost(q) and AtLeast(q+1) overlap with %d k-mers",
q, overlap.Len())
}
}
}
// BenchmarkQuorumAtLeast benchmarks quorum operations
func BenchmarkQuorumAtLeast(b *testing.B) {
k := 21
n := 10
group := NewKmerSetGroup(k, n)
// Populate with realistic data
for i := 0; i < n; i++ {
for j := uint64(0); j < 10000; j++ {
if (j % uint64(n)) <= uint64(i) {
group.Get(i).AddKmerCode(j)
}
}
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
_ = group.QuorumAtLeast(5)
}
}

View File

@@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// commited // commited
var _Commit = "09ac15a" var _Commit = "a43e625"
var _Version = "Release 4.4.0" var _Version = "Release 4.4.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.