diff --git a/pkg/obikmer/encodekmer_test.go b/pkg/obikmer/encodekmer_test.go index b228a81..7a274fd 100644 --- a/pkg/obikmer/encodekmer_test.go +++ b/pkg/obikmer/encodekmer_test.go @@ -352,8 +352,8 @@ func TestReverseComplementInvolution(t *testing.T) { } } -// TestNormalizeKmer tests the normalization function -func TestNormalizeKmer(t *testing.T) { +// TestCanonicalKmer tests the normalization function +func TestCanonicalKmer(t *testing.T) { tests := []struct { name string seq string @@ -374,7 +374,7 @@ func TestNormalizeKmer(t *testing.T) { kmer := kmers[0] rc := ReverseComplement(kmer, tt.k) - normalized := NormalizeKmer(kmer, tt.k) + normalized := CanonicalKmer(kmer, tt.k) // Normalized should be the minimum expectedNorm := kmer @@ -383,27 +383,27 @@ func TestNormalizeKmer(t *testing.T) { } if normalized != expectedNorm { - t.Errorf("NormalizeKmer(%d) = %d, want %d", kmer, normalized, expectedNorm) + t.Errorf("CanonicalKmer(%d) = %d, want %d", kmer, normalized, expectedNorm) } // Normalizing the RC should give the same result - normalizedRC := NormalizeKmer(rc, tt.k) + normalizedRC := CanonicalKmer(rc, tt.k) if normalizedRC != normalized { - t.Errorf("NormalizeKmer(RC) = %d, want %d (same as NormalizeKmer(fwd))", normalizedRC, normalized) + t.Errorf("CanonicalKmer(RC) = %d, want %d (same as CanonicalKmer(fwd))", normalizedRC, normalized) } }) } } -// TestEncodeNormalizedKmersBasic tests basic normalized k-mer encoding -func TestEncodeNormalizedKmersBasic(t *testing.T) { +// TestEncodeCanonicalKmersBasic tests basic normalized k-mer encoding +func TestEncodeCanonicalKmersBasic(t *testing.T) { // Test that a sequence and its reverse complement produce the same normalized k-mers seq := []byte("AACGTT") revComp := []byte("AACGTT") // This is a palindrome! k := 4 - kmers1 := EncodeNormalizedKmers(seq, k, nil) - kmers2 := EncodeNormalizedKmers(revComp, k, nil) + kmers1 := EncodeCanonicalKmers(seq, k, nil) + kmers2 := EncodeCanonicalKmers(revComp, k, nil) if len(kmers1) != len(kmers2) { t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2)) @@ -417,8 +417,8 @@ func TestEncodeNormalizedKmersBasic(t *testing.T) { } } -// TestEncodeNormalizedKmersSymmetry tests that seq and its RC produce same normalized k-mers (reversed) -func TestEncodeNormalizedKmersSymmetry(t *testing.T) { +// TestEncodeCanonicalKmersSymmetry tests that seq and its RC produce same normalized k-mers (reversed) +func TestEncodeCanonicalKmersSymmetry(t *testing.T) { // Manually construct a sequence and its reverse complement seq := []byte("ACGTAACCGG") @@ -430,8 +430,8 @@ func TestEncodeNormalizedKmersSymmetry(t *testing.T) { } k := 4 - kmers1 := EncodeNormalizedKmers(seq, k, nil) - kmers2 := EncodeNormalizedKmers(revComp, k, nil) + kmers1 := EncodeCanonicalKmers(seq, k, nil) + kmers2 := EncodeCanonicalKmers(revComp, k, nil) if len(kmers1) != len(kmers2) { t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2)) @@ -446,14 +446,14 @@ func TestEncodeNormalizedKmersSymmetry(t *testing.T) { } } -// TestEncodeNormalizedKmersConsistency verifies normalized k-mers match manual normalization -func TestEncodeNormalizedKmersConsistency(t *testing.T) { +// TestEncodeCanonicalKmersConsistency verifies normalized k-mers match manual normalization +func TestEncodeCanonicalKmersConsistency(t *testing.T) { seq := []byte("ACGTACGTACGTACGT") k := 8 // Get k-mers both ways rawKmers := EncodeKmers(seq, k, nil) - normalizedKmers := EncodeNormalizedKmers(seq, k, nil) + normalizedKmers := EncodeCanonicalKmers(seq, k, nil) if len(rawKmers) != len(normalizedKmers) { t.Fatalf("length mismatch: %d vs %d", len(rawKmers), len(normalizedKmers)) @@ -461,16 +461,16 @@ func TestEncodeNormalizedKmersConsistency(t *testing.T) { // Verify each normalized k-mer matches manual normalization for i, raw := range rawKmers { - expected := NormalizeKmer(raw, k) + expected := CanonicalKmer(raw, k) if normalizedKmers[i] != expected { - t.Errorf("position %d: EncodeNormalizedKmers gave %d, NormalizeKmer gave %d", + t.Errorf("position %d: EncodeCanonicalKmers gave %d, CanonicalKmer gave %d", i, normalizedKmers[i], expected) } } } -// BenchmarkEncodeNormalizedKmers benchmarks the normalized encoding function -func BenchmarkEncodeNormalizedKmers(b *testing.B) { +// BenchmarkEncodeCanonicalKmers benchmarks the normalized encoding function +func BenchmarkEncodeCanonicalKmers(b *testing.B) { sizes := []int{100, 1000, 10000, 100000} kSizes := []int{8, 16, 31} @@ -488,7 +488,7 @@ func BenchmarkEncodeNormalizedKmers(b *testing.B) { b.SetBytes(int64(size)) for i := 0; i < b.N; i++ { - EncodeNormalizedKmers(seq, k, &buffer) + EncodeCanonicalKmers(seq, k, &buffer) } }) } @@ -506,14 +506,14 @@ func BenchmarkReverseComplement(b *testing.B) { } } -// BenchmarkNormalizeKmer benchmarks the normalization function -func BenchmarkNormalizeKmer(b *testing.B) { +// BenchmarkCanonicalKmer benchmarks the normalization function +func BenchmarkCanonicalKmer(b *testing.B) { kmer := uint64(0x06C6C6C6C6C6C6C6) k := 31 b.ResetTimer() for i := 0; i < b.N; i++ { - NormalizeKmer(kmer, k) + CanonicalKmer(kmer, k) } } @@ -730,7 +730,7 @@ func TestExtractSuperKmersCanonical(t *testing.T) { for i, sk := range result { // Verify the minimizer is indeed canonical (equal to its normalized form) - normalized := NormalizeKmer(sk.Minimizer, m) + normalized := CanonicalKmer(sk.Minimizer, m) if sk.Minimizer != normalized { t.Errorf("super k-mer %d: minimizer %d is not canonical (normalized: %d)", i, sk.Minimizer, normalized) @@ -886,8 +886,8 @@ func TestKmerErrorMarkersWithRealKmers(t *testing.T) { } // Verify normalization works with error bits cleared - normalized1 := NormalizeKmer(originalKmer, k) - normalized2 := NormalizeKmer(ClearKmerError(marked), k) + normalized1 := CanonicalKmer(originalKmer, k) + normalized2 := CanonicalKmer(ClearKmerError(marked), k) if normalized1 != normalized2 { t.Errorf("Normalization affected by error bits") } @@ -977,8 +977,8 @@ func TestReverseComplementPreservesErrorBits(t *testing.T) { } } -// TestNormalizeKmerWithErrorBits tests that NormalizeKmer works with error bits -func TestNormalizeKmerWithErrorBits(t *testing.T) { +// TestCanonicalKmerWithErrorBits tests that CanonicalKmer works with error bits +func TestCanonicalKmerWithErrorBits(t *testing.T) { seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") k := 31 @@ -995,7 +995,7 @@ func TestNormalizeKmerWithErrorBits(t *testing.T) { marked := SetKmerError(originalKmer, errCode) // Normalize should work on the sequence part - normalized := NormalizeKmer(marked, k) + normalized := CanonicalKmer(marked, k) // Error bits should be preserved if GetKmerError(normalized) != errCode { @@ -1004,7 +1004,7 @@ func TestNormalizeKmerWithErrorBits(t *testing.T) { // The sequence part should be normalized cleanNormalized := ClearKmerError(normalized) - expectedNormalized := NormalizeKmer(ClearKmerError(marked), k) + expectedNormalized := CanonicalKmer(ClearKmerError(marked), k) if cleanNormalized != expectedNormalized { t.Errorf("Normalization incorrect with error bits present") @@ -1081,19 +1081,19 @@ func TestIterKmers(t *testing.T) { } } -// TestIterNormalizedKmers tests the normalized k-mer iterator -func TestIterNormalizedKmers(t *testing.T) { +// TestIterCanonicalKmers tests the normalized k-mer iterator +func TestIterCanonicalKmers(t *testing.T) { seq := []byte("ACGTACGTACGT") k := 6 // Collect k-mers via iterator var iterKmers []uint64 - for kmer := range IterNormalizedKmers(seq, k) { + for kmer := range IterCanonicalKmers(seq, k) { iterKmers = append(iterKmers, kmer) } // Compare with slice-based version - sliceKmers := EncodeNormalizedKmers(seq, k, nil) + sliceKmers := EncodeCanonicalKmers(seq, k, nil) if len(iterKmers) != len(sliceKmers) { t.Errorf("length mismatch: iter=%d, slice=%d", len(iterKmers), len(sliceKmers)) @@ -1151,8 +1151,8 @@ func BenchmarkIterKmers(b *testing.B) { }) } -// BenchmarkIterNormalizedKmers benchmarks the normalized iterator -func BenchmarkIterNormalizedKmers(b *testing.B) { +// BenchmarkIterCanonicalKmers benchmarks the normalized iterator +func BenchmarkIterCanonicalKmers(b *testing.B) { seq := make([]byte, 10000) for i := range seq { seq[i] = "ACGT"[i%4] @@ -1163,7 +1163,7 @@ func BenchmarkIterNormalizedKmers(b *testing.B) { b.ResetTimer() for i := 0; i < b.N; i++ { count := 0 - for range IterNormalizedKmers(seq, k) { + for range IterCanonicalKmers(seq, k) { count++ } } @@ -1173,7 +1173,7 @@ func BenchmarkIterNormalizedKmers(b *testing.B) { var buffer []uint64 b.ResetTimer() for i := 0; i < b.N; i++ { - buffer = EncodeNormalizedKmers(seq, k, &buffer) + buffer = EncodeCanonicalKmers(seq, k, &buffer) } }) } diff --git a/pkg/obikmer/kmer_set_group_quorum.go b/pkg/obikmer/kmer_set_group_quorum.go new file mode 100644 index 0000000..4f21f95 --- /dev/null +++ b/pkg/obikmer/kmer_set_group_quorum.go @@ -0,0 +1,235 @@ +package obikmer + +import ( + "container/heap" + + "github.com/RoaringBitmap/roaring/roaring64" +) + +// heapItem represents an element in the min-heap for k-way merge +type heapItem struct { + value uint64 + idx int +} + +// kmerMinHeap implements heap.Interface for k-way merge algorithm +type kmerMinHeap []heapItem + +func (h kmerMinHeap) Len() int { return len(h) } +func (h kmerMinHeap) Less(i, j int) bool { return h[i].value < h[j].value } +func (h kmerMinHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } + +func (h *kmerMinHeap) Push(x interface{}) { + *h = append(*h, x.(heapItem)) +} + +func (h *kmerMinHeap) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +// QuorumAtLeast returns k-mers present in at least q sets +// +// Algorithm: K-way merge with min-heap counting +// +// The algorithm processes all k-mers in sorted order using a min-heap: +// +// 1. Initialize one iterator per non-empty set +// 2. Build a min-heap of (value, set_index) pairs, one per iterator +// 3. While heap is not empty: +// a. Extract the minimum value v from heap +// b. Pop ALL heap items with value == v (counting occurrences) +// c. If count >= q, add v to result +// d. Advance each popped iterator and re-insert into heap if valid +// +// This ensures each unique k-mer is counted exactly once across all sets. +// +// Time complexity: O(M log N) +// - M = sum of all set cardinalities (total k-mer occurrences) +// - N = number of sets +// - Each k-mer occurrence is inserted/extracted from heap once: O(M) operations +// - Each heap operation costs O(log N) +// +// Space complexity: O(N) +// - Heap contains at most N elements (one per set iterator) +// - Output bitmap size depends on quorum result +// +// Special cases (optimized): +// - q <= 0: returns empty set +// - q == 1: delegates to Union() (native OR operations) +// - q == n: delegates to Intersect() (native AND operations) +// - q > n: returns empty set (impossible to satisfy) +func (ksg *KmerSetGroup) QuorumAtLeast(q int) *KmerSet { + n := len(ksg.sets) + + // Edge cases + if q <= 0 || n == 0 { + return NewKmerSet(ksg.k) + } + if q > n { + return NewKmerSet(ksg.k) + } + if q == 1 { + return ksg.Union() + } + if q == n { + return ksg.Intersect() + } + + // Initialize iterators for all non-empty sets + iterators := make([]roaring64.IntIterable64, 0, n) + iterIndices := make([]int, 0, n) + + for i, set := range ksg.sets { + if set.Len() > 0 { + iter := set.bitmap.Iterator() + if iter.HasNext() { + iterators = append(iterators, iter) + iterIndices = append(iterIndices, i) + } + } + } + + if len(iterators) == 0 { + return NewKmerSet(ksg.k) + } + + // Initialize heap with first value from each iterator + h := make(kmerMinHeap, len(iterators)) + for i, iter := range iterators { + h[i] = heapItem{value: iter.Next(), idx: i} + } + heap.Init(&h) + + // Result bitmap + result := roaring64.New() + + // K-way merge with counting + for len(h) > 0 { + minVal := h[0].value + count := 0 + activeIndices := make([]int, 0, len(h)) + + // Pop all elements with same value (count occurrences) + for len(h) > 0 && h[0].value == minVal { + item := heap.Pop(&h).(heapItem) + count++ + activeIndices = append(activeIndices, item.idx) + } + + // Add to result if quorum reached + if count >= q { + result.Add(minVal) + } + + // Advance iterators and re-insert into heap + for _, iterIdx := range activeIndices { + if iterators[iterIdx].HasNext() { + heap.Push(&h, heapItem{ + value: iterators[iterIdx].Next(), + idx: iterIdx, + }) + } + } + } + + return NewKmerSetFromBitmap(ksg.k, result) +} + +// QuorumAtMost returns k-mers present in at most q sets +// +// Algorithm: Uses the mathematical identity +// AtMost(q) = Union() - AtLeast(q+1) +// +// Proof: +// - Union() contains all k-mers present in at least 1 set +// - AtLeast(q+1) contains all k-mers present in q+1 or more sets +// - Their difference contains only k-mers present in at most q sets +// +// Implementation: +// 1. Compute U = Union() +// 2. Compute A = QuorumAtLeast(q+1) +// 3. Return U - A using bitmap AndNot operation +// +// Time complexity: O(M log N) +// - Union(): O(M) with native OR operations +// - QuorumAtLeast(q+1): O(M log N) +// - AndNot: O(|U|) where |U| <= M +// - Total: O(M log N) +// +// Space complexity: O(N) +// - Inherited from QuorumAtLeast heap +// +// Special cases: +// - q <= 0: returns empty set +// - q >= n: returns Union() (all k-mers are in at most n sets) +func (ksg *KmerSetGroup) QuorumAtMost(q int) *KmerSet { + n := len(ksg.sets) + + // Edge cases + if q <= 0 { + return NewKmerSet(ksg.k) + } + if q >= n { + return ksg.Union() + } + + // Compute Union() - AtLeast(q+1) + union := ksg.Union() + atLeastQ1 := ksg.QuorumAtLeast(q + 1) + + // Difference: elements in union but not in atLeastQ1 + result := union.bitmap.Clone() + result.AndNot(atLeastQ1.bitmap) + + return NewKmerSetFromBitmap(ksg.k, result) +} + +// QuorumExactly returns k-mers present in exactly q sets +// +// Algorithm: Uses the mathematical identity +// Exactly(q) = AtLeast(q) - AtLeast(q+1) +// +// Proof: +// - AtLeast(q) contains all k-mers present in q or more sets +// - AtLeast(q+1) contains all k-mers present in q+1 or more sets +// - Their difference contains only k-mers present in exactly q sets +// +// Implementation: +// 1. Compute A = QuorumAtLeast(q) +// 2. Compute B = QuorumAtLeast(q+1) +// 3. Return A - B using bitmap AndNot operation +// +// Time complexity: O(M log N) +// - Two calls to QuorumAtLeast: 2 * O(M log N) +// - One AndNot operation: O(|A|) where |A| <= M +// - Total: O(M log N) since AndNot is dominated by merge operations +// +// Space complexity: O(N) +// - Inherited from QuorumAtLeast heap +// - Two temporary bitmaps for intermediate results +// +// Special cases: +// - q <= 0: returns empty set +// - q > n: returns empty set (impossible to have k-mer in more than n sets) +func (ksg *KmerSetGroup) QuorumExactly(q int) *KmerSet { + n := len(ksg.sets) + + // Edge cases + if q <= 0 || q > n { + return NewKmerSet(ksg.k) + } + + // Compute AtLeast(q) - AtLeast(q+1) + aq := ksg.QuorumAtLeast(q) + aq1 := ksg.QuorumAtLeast(q + 1) + + // Difference: elements in aq but not in aq1 + result := aq.bitmap.Clone() + result.AndNot(aq1.bitmap) + + return NewKmerSetFromBitmap(ksg.k, result) +} diff --git a/pkg/obikmer/kmer_set_group_quorum_test.go b/pkg/obikmer/kmer_set_group_quorum_test.go new file mode 100644 index 0000000..ab11319 --- /dev/null +++ b/pkg/obikmer/kmer_set_group_quorum_test.go @@ -0,0 +1,395 @@ +package obikmer + +import ( + "testing" +) + +// TestQuorumAtLeastEdgeCases tests edge cases for QuorumAtLeast +func TestQuorumAtLeastEdgeCases(t *testing.T) { + k := 5 + + // Test group with all empty sets + emptyGroup := NewKmerSetGroup(k, 3) + result := emptyGroup.QuorumAtLeast(1) + if result.Len() != 0 { + t.Errorf("Empty sets: expected 0 k-mers, got %d", result.Len()) + } + + // Test q <= 0 + group := NewKmerSetGroup(k, 3) + result = group.QuorumAtLeast(0) + if result.Len() != 0 { + t.Errorf("q=0: expected 0 k-mers, got %d", result.Len()) + } + + result = group.QuorumAtLeast(-1) + if result.Len() != 0 { + t.Errorf("q=-1: expected 0 k-mers, got %d", result.Len()) + } + + // Test q > n + group.Get(0).AddKmerCode(1) + result = group.QuorumAtLeast(10) + if result.Len() != 0 { + t.Errorf("q>n: expected 0 k-mers, got %d", result.Len()) + } +} + +// TestQuorumAtLeastQ1 tests q=1 (should equal Union) +func TestQuorumAtLeastQ1(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 3) + + // Add different k-mers to each set + group.Get(0).AddKmerCode(1) + group.Get(0).AddKmerCode(2) + group.Get(1).AddKmerCode(2) + group.Get(1).AddKmerCode(3) + group.Get(2).AddKmerCode(3) + group.Get(2).AddKmerCode(4) + + quorum := group.QuorumAtLeast(1) + union := group.Union() + + if quorum.Len() != union.Len() { + t.Errorf("QuorumAtLeast(1) length %d != Union length %d", quorum.Len(), union.Len()) + } + + // Check all elements match + for kmer := uint64(1); kmer <= 4; kmer++ { + if quorum.Contains(kmer) != union.Contains(kmer) { + t.Errorf("Mismatch for k-mer %d", kmer) + } + } +} + +// TestQuorumAtLeastQN tests q=n (should equal Intersect) +func TestQuorumAtLeastQN(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 3) + + // Add some common k-mers and some unique + for i := 0; i < 3; i++ { + group.Get(i).AddKmerCode(10) // common to all + group.Get(i).AddKmerCode(20) // common to all + } + group.Get(0).AddKmerCode(1) // unique to set 0 + group.Get(1).AddKmerCode(2) // unique to set 1 + + quorum := group.QuorumAtLeast(3) + intersect := group.Intersect() + + if quorum.Len() != intersect.Len() { + t.Errorf("QuorumAtLeast(n) length %d != Intersect length %d", quorum.Len(), intersect.Len()) + } + + if quorum.Len() != 2 { + t.Errorf("Expected 2 common k-mers, got %d", quorum.Len()) + } + + if !quorum.Contains(10) || !quorum.Contains(20) { + t.Error("Missing common k-mers") + } + + if quorum.Contains(1) || quorum.Contains(2) { + t.Error("Unique k-mers should not be in result") + } +} + +// TestQuorumAtLeastGeneral tests general quorum values +func TestQuorumAtLeastGeneral(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 5) + + // Setup: k-mer i appears in i sets (for i=1..5) + // k-mer 1: in set 0 + // k-mer 2: in sets 0,1 + // k-mer 3: in sets 0,1,2 + // k-mer 4: in sets 0,1,2,3 + // k-mer 5: in sets 0,1,2,3,4 (all) + + for kmer := uint64(1); kmer <= 5; kmer++ { + for setIdx := 0; setIdx < int(kmer); setIdx++ { + group.Get(setIdx).AddKmerCode(kmer) + } + } + + tests := []struct { + q int + expected map[uint64]bool + }{ + {1, map[uint64]bool{1: true, 2: true, 3: true, 4: true, 5: true}}, + {2, map[uint64]bool{2: true, 3: true, 4: true, 5: true}}, + {3, map[uint64]bool{3: true, 4: true, 5: true}}, + {4, map[uint64]bool{4: true, 5: true}}, + {5, map[uint64]bool{5: true}}, + } + + for _, tt := range tests { + result := group.QuorumAtLeast(tt.q) + + if result.Len() != uint64(len(tt.expected)) { + t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) + } + + for kmer := uint64(1); kmer <= 5; kmer++ { + shouldContain := tt.expected[kmer] + doesContain := result.Contains(kmer) + if shouldContain != doesContain { + t.Errorf("q=%d, k-mer=%d: expected contains=%v, got %v", tt.q, kmer, shouldContain, doesContain) + } + } + } +} + +// TestQuorumExactlyBasic tests QuorumExactly basic functionality +func TestQuorumExactlyBasic(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 5) + + // Setup: k-mer i appears in exactly i sets + for kmer := uint64(1); kmer <= 5; kmer++ { + for setIdx := 0; setIdx < int(kmer); setIdx++ { + group.Get(setIdx).AddKmerCode(kmer) + } + } + + tests := []struct { + q int + expected []uint64 + }{ + {1, []uint64{1}}, + {2, []uint64{2}}, + {3, []uint64{3}}, + {4, []uint64{4}}, + {5, []uint64{5}}, + } + + for _, tt := range tests { + result := group.QuorumExactly(tt.q) + + if result.Len() != uint64(len(tt.expected)) { + t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) + } + + for _, kmer := range tt.expected { + if !result.Contains(kmer) { + t.Errorf("q=%d: missing k-mer %d", tt.q, kmer) + } + } + } +} + +// TestQuorumIdentity tests the mathematical identity: Exactly(q) = AtLeast(q) - AtLeast(q+1) +func TestQuorumIdentity(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 4) + + // Add random distribution + group.Get(0).AddKmerCode(1) + group.Get(0).AddKmerCode(2) + group.Get(0).AddKmerCode(3) + + group.Get(1).AddKmerCode(2) + group.Get(1).AddKmerCode(3) + group.Get(1).AddKmerCode(4) + + group.Get(2).AddKmerCode(3) + group.Get(2).AddKmerCode(4) + + group.Get(3).AddKmerCode(4) + + for q := 1; q <= 4; q++ { + exactly := group.QuorumExactly(q) + atLeast := group.QuorumAtLeast(q) + atLeastPlus1 := group.QuorumAtLeast(q + 1) + + // Verify: every element in exactly(q) is in atLeast(q) + iter := exactly.Iterator() + for iter.HasNext() { + kmer := iter.Next() + if !atLeast.Contains(kmer) { + t.Errorf("q=%d: k-mer %d in Exactly but not in AtLeast", q, kmer) + } + if atLeastPlus1.Contains(kmer) { + t.Errorf("q=%d: k-mer %d in Exactly but also in AtLeast(q+1)", q, kmer) + } + } + } +} + +// TestQuorumDisjointSets tests quorum on completely disjoint sets +func TestQuorumDisjointSets(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 3) + + // Each set has unique k-mers + group.Get(0).AddKmerCode(1) + group.Get(1).AddKmerCode(2) + group.Get(2).AddKmerCode(3) + + // q=1 should give all + result := group.QuorumAtLeast(1) + if result.Len() != 3 { + t.Errorf("Disjoint sets q=1: expected 3, got %d", result.Len()) + } + + // q=2 should give none + result = group.QuorumAtLeast(2) + if result.Len() != 0 { + t.Errorf("Disjoint sets q=2: expected 0, got %d", result.Len()) + } +} + +// TestQuorumIdenticalSets tests quorum on identical sets +func TestQuorumIdenticalSets(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 3) + + // All sets have same k-mers + for i := 0; i < 3; i++ { + group.Get(i).AddKmerCode(10) + group.Get(i).AddKmerCode(20) + group.Get(i).AddKmerCode(30) + } + + // Any q <= n should give all k-mers + for q := 1; q <= 3; q++ { + result := group.QuorumAtLeast(q) + if result.Len() != 3 { + t.Errorf("Identical sets q=%d: expected 3, got %d", q, result.Len()) + } + } +} + +// TestQuorumLargeNumbers tests with large k-mer values +func TestQuorumLargeNumbers(t *testing.T) { + k := 21 + group := NewKmerSetGroup(k, 3) + + // Use large uint64 values (actual k-mer encodings) + largeKmers := []uint64{ + 0x1234567890ABCDEF, + 0xFEDCBA0987654321, + 0xAAAAAAAAAAAAAAAA, + } + + // Add to multiple sets + for i := 0; i < 3; i++ { + for j := 0; j <= i; j++ { + group.Get(j).AddKmerCode(largeKmers[i]) + } + } + + result := group.QuorumAtLeast(2) + if result.Len() != 2 { + t.Errorf("Large numbers q=2: expected 2, got %d", result.Len()) + } + + if !result.Contains(largeKmers[1]) || !result.Contains(largeKmers[2]) { + t.Error("Large numbers: wrong k-mers in result") + } +} + +// TestQuorumAtMostBasic tests QuorumAtMost basic functionality +func TestQuorumAtMostBasic(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 5) + + // Setup: k-mer i appears in exactly i sets + for kmer := uint64(1); kmer <= 5; kmer++ { + for setIdx := 0; setIdx < int(kmer); setIdx++ { + group.Get(setIdx).AddKmerCode(kmer) + } + } + + tests := []struct { + q int + expected []uint64 + }{ + {0, []uint64{}}, // at most 0: none + {1, []uint64{1}}, // at most 1: only k-mer 1 + {2, []uint64{1, 2}}, // at most 2: k-mers 1,2 + {3, []uint64{1, 2, 3}}, // at most 3: k-mers 1,2,3 + {4, []uint64{1, 2, 3, 4}}, // at most 4: k-mers 1,2,3,4 + {5, []uint64{1, 2, 3, 4, 5}}, // at most 5: all k-mers + {10, []uint64{1, 2, 3, 4, 5}}, // at most 10: all k-mers + } + + for _, tt := range tests { + result := group.QuorumAtMost(tt.q) + + if result.Len() != uint64(len(tt.expected)) { + t.Errorf("q=%d: expected %d k-mers, got %d", tt.q, len(tt.expected), result.Len()) + } + + for _, kmer := range tt.expected { + if !result.Contains(kmer) { + t.Errorf("q=%d: missing k-mer %d", tt.q, kmer) + } + } + } +} + +// TestQuorumComplementIdentity tests that AtLeast and AtMost are complementary +func TestQuorumComplementIdentity(t *testing.T) { + k := 5 + group := NewKmerSetGroup(k, 4) + + // Add random distribution + group.Get(0).AddKmerCode(1) + group.Get(0).AddKmerCode(2) + group.Get(0).AddKmerCode(3) + + group.Get(1).AddKmerCode(2) + group.Get(1).AddKmerCode(3) + group.Get(1).AddKmerCode(4) + + group.Get(2).AddKmerCode(3) + group.Get(2).AddKmerCode(4) + + group.Get(3).AddKmerCode(4) + + union := group.Union() + + for q := 1; q < 4; q++ { + atMost := group.QuorumAtMost(q) + atLeast := group.QuorumAtLeast(q + 1) + + // Verify: AtMost(q) ∪ AtLeast(q+1) = Union() + combined := atMost.Union(atLeast) + + if combined.Len() != union.Len() { + t.Errorf("q=%d: AtMost(q) ∪ AtLeast(q+1) has %d k-mers, Union has %d", + q, combined.Len(), union.Len()) + } + + // Verify: AtMost(q) ∩ AtLeast(q+1) = ∅ + overlap := atMost.Intersect(atLeast) + if overlap.Len() != 0 { + t.Errorf("q=%d: AtMost(q) and AtLeast(q+1) overlap with %d k-mers", + q, overlap.Len()) + } + } +} + +// BenchmarkQuorumAtLeast benchmarks quorum operations +func BenchmarkQuorumAtLeast(b *testing.B) { + k := 21 + n := 10 + group := NewKmerSetGroup(k, n) + + // Populate with realistic data + for i := 0; i < n; i++ { + for j := uint64(0); j < 10000; j++ { + if (j % uint64(n)) <= uint64(i) { + group.Get(i).AddKmerCode(j) + } + } + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = group.QuorumAtLeast(5) + } +} diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 2fa10ac..089e158 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "09ac15a" +var _Commit = "a43e625" var _Version = "Release 4.4.0" // Version returns the version of the obitools package.