diff --git a/pkg/obikmer/encodekmer.go b/pkg/obikmer/encodekmer.go index 7604ea8..7c36f73 100644 --- a/pkg/obikmer/encodekmer.go +++ b/pkg/obikmer/encodekmer.go @@ -54,6 +54,162 @@ func EncodeKmers(seq []byte, k int, buffer *[]uint64) []uint64 { return result } +// SuperKmer represents a maximal subsequence where all consecutive k-mers +// share the same minimizer. A minimizer is the smallest canonical m-mer +// among the (k-m+1) m-mers contained in a k-mer. +type SuperKmer struct { + Minimizer uint64 // The canonical minimizer value (normalized m-mer) + Start int // Starting position in the original sequence (0-indexed) + End int // Ending position (exclusive, like Go slice notation) + Sequence []byte // The actual DNA subsequence [Start:End] +} + +// dequeItem represents an element in the monotone deque used for +// tracking minimizers in a sliding window. +type dequeItem struct { + position int // Position of the m-mer in the sequence + canonical uint64 // Canonical (normalized) m-mer value +} + +// ExtractSuperKmers extracts super k-mers from a DNA sequence. +// A super k-mer is a maximal subsequence where all consecutive k-mers +// share the same minimizer. The minimizer of a k-mer is the smallest +// canonical m-mer among its (k-m+1) constituent m-mers. +// +// The algorithm uses: +// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation +// - Monotone deque for O(1) amortized minimizer tracking per position +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) +// - k: k-mer size (must be between m+1 and 32) +// - m: minimizer size (must be between 1 and k-1) +// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. +// +// Returns: +// - slice of SuperKmer structs representing maximal subsequences +// - nil if parameters are invalid or sequence is too short +// +// Time complexity: O(n) where n is the sequence length +// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results +func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer { + // Validate parameters + if m < 1 || m >= k || k < 2 || k > 32 || len(seq) < k { + return nil + } + + // Initialize result buffer + var result []SuperKmer + if buffer == nil { + // Estimate: worst case is one super k-mer per k nucleotides + estimatedSize := len(seq) / k + if estimatedSize < 1 { + estimatedSize = 1 + } + result = make([]SuperKmer, 0, estimatedSize) + } else { + result = (*buffer)[:0] + } + + // Initialize monotone deque for tracking minimizers + deque := make([]dequeItem, 0, k-m+1) + + // Masks for m-mer encoding + mMask := uint64(1)<<(m*2) - 1 + rcShift := uint((m - 1) * 2) + + // Build first m-1 nucleotides (can't form complete m-mer yet) + var fwdMmer, rvcMmer uint64 + for i := 0; i < m-1 && i < len(seq); i++ { + code := uint64(__single_base_code__[seq[i]&31]) + fwdMmer = (fwdMmer << 2) | code + rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift) + } + + // Track super k-mer boundaries + superKmerStart := 0 + var currentMinimizer uint64 + firstKmer := true + + // Slide through sequence, processing each position that completes an m-mer + for pos := m - 1; pos < len(seq); pos++ { + // Add new nucleotide to m-mer + code := uint64(__single_base_code__[seq[pos]&31]) + fwdMmer = ((fwdMmer << 2) | code) & mMask + rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift) + + // Get canonical m-mer (minimum of forward and reverse complement) + canonical := fwdMmer + if rvcMmer < fwdMmer { + canonical = rvcMmer + } + + mmerPos := pos - m + 1 + + // Remove m-mers outside the current k-mer window from front of deque + // The k-mer at position pos spans from (pos-k+1) to pos + // It contains m-mers from position (pos-k+1) to (pos-m+1) + if pos >= k-1 { + windowStart := pos - k + 1 + for len(deque) > 0 && deque[0].position < windowStart { + deque = deque[1:] + } + } + + // Maintain monotone property: remove larger values from back + for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical { + deque = deque[:len(deque)-1] + } + + // Add new m-mer to deque + deque = append(deque, dequeItem{position: mmerPos, canonical: canonical}) + + // Once we have processed the first k nucleotides, we have our first k-mer + if pos >= k-1 { + // The minimizer is at the front of the deque + newMinimizer := deque[0].canonical + kmerStart := pos - k + 1 // Start position of current k-mer (ending at pos) + + if firstKmer { + // Initialize first super k-mer + currentMinimizer = newMinimizer + firstKmer = false + } else if newMinimizer != currentMinimizer { + // Minimizer changed at this k-mer position + // Previous k-mer started at position kmerStart-1 + // That k-mer is seq[kmerStart-1 : kmerStart-1+k] (Go slice notation) + // The last base of that k-mer is at kmerStart-1+k-1 = kmerStart+k-2 + // In Go slice notation (exclusive end): kmerStart+k-1 + endPos := kmerStart + k - 1 + superKmer := SuperKmer{ + Minimizer: currentMinimizer, + Start: superKmerStart, + End: endPos, + Sequence: seq[superKmerStart:endPos], + } + result = append(result, superKmer) + + // New super k-mer starts at current k-mer position + superKmerStart = kmerStart + currentMinimizer = newMinimizer + } + } + } + + // Emit final super k-mer + if !firstKmer { + superKmer := SuperKmer{ + Minimizer: currentMinimizer, + Start: superKmerStart, + End: len(seq), + Sequence: seq[superKmerStart:], + } + result = append(result, superKmer) + } + + return result +} + // ReverseComplement computes the reverse complement of an encoded k-mer. // The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11). // The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11. diff --git a/pkg/obikmer/encodekmer_test.go b/pkg/obikmer/encodekmer_test.go index e89c0c3..9397e48 100644 --- a/pkg/obikmer/encodekmer_test.go +++ b/pkg/obikmer/encodekmer_test.go @@ -516,3 +516,316 @@ func BenchmarkNormalizeKmer(b *testing.B) { NormalizeKmer(kmer, k) } } + +// TestExtractSuperKmersBasic tests basic super k-mer extraction +func TestExtractSuperKmersBasic(t *testing.T) { + tests := []struct { + name string + seq string + k int + m int + validate func(*testing.T, []SuperKmer) + }{ + { + name: "simple sequence", + seq: "ACGTACGTACGT", + k: 5, + m: 3, + validate: func(t *testing.T, sks []SuperKmer) { + if len(sks) == 0 { + t.Error("expected at least one super k-mer") + } + // Verify all super k-mers cover the sequence + totalLen := 0 + for _, sk := range sks { + totalLen += sk.End - sk.Start + if string(sk.Sequence) != string([]byte(t.Name())[len(t.Name())-len(sk.Sequence):]) { + // Just verify Start/End matches Sequence + if string(sk.Sequence) != string([]byte("ACGTACGTACGT")[sk.Start:sk.End]) { + t.Errorf("Sequence mismatch: seq[%d:%d] != %s", sk.Start, sk.End, sk.Sequence) + } + } + } + }, + }, + { + name: "single k-mer sequence", + seq: "ACGTACGT", + k: 8, + m: 4, + validate: func(t *testing.T, sks []SuperKmer) { + if len(sks) != 1 { + t.Errorf("expected exactly 1 super k-mer for len(seq)==k, got %d", len(sks)) + } + if len(sks) > 0 { + if sks[0].Start != 0 || sks[0].End != 8 { + t.Errorf("expected [0:8], got [%d:%d]", sks[0].Start, sks[0].End) + } + } + }, + }, + { + name: "repeating sequence", + seq: "AAAAAAAAAA", + k: 5, + m: 3, + validate: func(t *testing.T, sks []SuperKmer) { + // Repeating A should have same minimizer (AAA) everywhere + if len(sks) != 1 { + t.Errorf("expected 1 super k-mer for repeating sequence, got %d", len(sks)) + } + if len(sks) > 0 { + if sks[0].Start != 0 || sks[0].End != 10 { + t.Errorf("expected super k-mer to cover entire sequence [0:10], got [%d:%d]", + sks[0].Start, sks[0].End) + } + } + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ExtractSuperKmers([]byte(tt.seq), tt.k, tt.m, nil) + tt.validate(t, result) + }) + } +} + +// TestExtractSuperKmersEdgeCases tests edge cases and error handling +func TestExtractSuperKmersEdgeCases(t *testing.T) { + tests := []struct { + name string + seq string + k int + m int + expectNil bool + }{ + {"empty sequence", "", 5, 3, true}, + {"seq shorter than k", "ACG", 5, 3, true}, + {"m < 1", "ACGTACGT", 5, 0, true}, + {"m >= k", "ACGTACGT", 5, 5, true}, + {"m == k-1 (valid)", "ACGTACGT", 5, 4, false}, + {"k < 2", "ACGTACGT", 1, 1, true}, + {"k > 32", "ACGTACGTACGTACGTACGTACGTACGTACGTACGT", 33, 16, true}, + {"k == 32 (valid)", "ACGTACGTACGTACGTACGTACGTACGTACGT", 32, 16, false}, + {"seq == k (valid)", "ACGTACGT", 8, 4, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ExtractSuperKmers([]byte(tt.seq), tt.k, tt.m, nil) + if tt.expectNil && result != nil { + t.Errorf("expected nil, got %v", result) + } + if !tt.expectNil && result == nil { + t.Errorf("expected non-nil result, got nil") + } + }) + } +} + +// TestExtractSuperKmersBoundaries verifies Start/End positions +func TestExtractSuperKmersBoundaries(t *testing.T) { + seq := []byte("ACGTACGTGGGGAAAA") + k := 6 + m := 3 + + result := ExtractSuperKmers(seq, k, m, nil) + + if result == nil { + t.Fatal("expected non-nil result") + } + + // Verify each super k-mer + for i, sk := range result { + // Verify Start < End + if sk.Start >= sk.End { + t.Errorf("super k-mer %d: Start (%d) >= End (%d)", i, sk.Start, sk.End) + } + + // Verify Sequence matches seq[Start:End] + expected := string(seq[sk.Start:sk.End]) + actual := string(sk.Sequence) + if actual != expected { + t.Errorf("super k-mer %d: Sequence mismatch: got %s, want %s", i, actual, expected) + } + + // Verify bounds are within sequence + if sk.Start < 0 || sk.End > len(seq) { + t.Errorf("super k-mer %d: bounds [%d:%d] outside sequence length %d", + i, sk.Start, sk.End, len(seq)) + } + + // Verify minimum length is k + if sk.End-sk.Start < k { + t.Errorf("super k-mer %d: length %d < k=%d", i, sk.End-sk.Start, k) + } + } + + // Verify super k-mers can overlap (by up to k-1 bases) but must be ordered + // and the overlap should not exceed k-1 + for i := 0; i < len(result)-1; i++ { + // Next super k-mer should start before or at the end of current one + // Overlap is allowed and expected + overlap := result[i].End - result[i+1].Start + if overlap > k-1 { + t.Errorf("super k-mers %d and %d overlap by %d bases (max allowed: %d): [%d:%d] and [%d:%d]", + i, i+1, overlap, k-1, result[i].Start, result[i].End, result[i+1].Start, result[i+1].End) + } + // But the start positions should be ordered + if result[i+1].Start < result[i].Start { + t.Errorf("super k-mers %d and %d are not ordered: [%d:%d] and [%d:%d]", + i, i+1, result[i].Start, result[i].End, result[i+1].Start, result[i+1].End) + } + } +} + +// TestExtractSuperKmersBufferReuse tests buffer parameter +func TestExtractSuperKmersBufferReuse(t *testing.T) { + seq := []byte("ACGTACGTACGTACGT") + k := 6 + m := 3 + + // First call without buffer + result1 := ExtractSuperKmers(seq, k, m, nil) + + // Second call with buffer + buffer := make([]SuperKmer, 0, 100) + result2 := ExtractSuperKmers(seq, k, m, &buffer) + + if len(result1) != len(result2) { + t.Errorf("buffer reuse: length mismatch %d vs %d", len(result1), len(result2)) + } + + for i := range result1 { + if result1[i].Minimizer != result2[i].Minimizer { + t.Errorf("position %d: minimizer mismatch", i) + } + if result1[i].Start != result2[i].Start || result1[i].End != result2[i].End { + t.Errorf("position %d: boundary mismatch", i) + } + } + + // Test multiple calls with same buffer + for i := 0; i < 10; i++ { + result3 := ExtractSuperKmers(seq, k, m, &buffer) + if len(result3) != len(result1) { + t.Errorf("iteration %d: length mismatch", i) + } + } +} + +// TestExtractSuperKmersCanonical verifies minimizers are canonical +func TestExtractSuperKmersCanonical(t *testing.T) { + seq := []byte("ACGTACGTACGT") + k := 6 + m := 3 + + result := ExtractSuperKmers(seq, k, m, nil) + + if result == nil { + t.Fatal("expected non-nil result") + } + + for i, sk := range result { + // Verify the minimizer is indeed canonical (equal to its normalized form) + normalized := NormalizeKmer(sk.Minimizer, m) + if sk.Minimizer != normalized { + t.Errorf("super k-mer %d: minimizer %d is not canonical (normalized: %d)", + i, sk.Minimizer, normalized) + } + + // The minimizer should be <= its reverse complement + rc := ReverseComplement(sk.Minimizer, m) + if sk.Minimizer > rc { + t.Errorf("super k-mer %d: minimizer %d > reverse complement %d (not canonical)", + i, sk.Minimizer, rc) + } + } +} + +// TestExtractSuperKmersVariousKM tests various k and m combinations +func TestExtractSuperKmersVariousKM(t *testing.T) { + seq := []byte("ACGTACGTACGTACGTACGTACGT") + + configs := []struct { + k int + m int + }{ + {5, 3}, + {8, 4}, + {10, 5}, + {16, 8}, + {21, 11}, + {6, 5}, // m = k-1 + {4, 2}, + } + + for _, cfg := range configs { + t.Run("k"+string(rune('0'+cfg.k/10))+string(rune('0'+cfg.k%10))+"_m"+string(rune('0'+cfg.m/10))+string(rune('0'+cfg.m%10)), func(t *testing.T) { + if len(seq) < cfg.k { + t.Skip("sequence too short for this k") + } + + result := ExtractSuperKmers(seq, cfg.k, cfg.m, nil) + + if result == nil { + t.Fatal("expected non-nil result for valid parameters") + } + + if len(result) == 0 { + t.Error("expected at least one super k-mer") + } + + // Verify each super k-mer has minimum length k + for i, sk := range result { + length := sk.End - sk.Start + if length < cfg.k { + t.Errorf("super k-mer %d has length %d < k=%d", i, length, cfg.k) + } + } + }) + } +} + +// BenchmarkExtractSuperKmers benchmarks the super k-mer extraction +func BenchmarkExtractSuperKmers(b *testing.B) { + sizes := []int{100, 1000, 10000, 100000} + configs := []struct { + k int + m int + }{ + {21, 11}, + {31, 15}, + {16, 8}, + {10, 5}, + } + + for _, cfg := range configs { + for _, size := range sizes { + seq := make([]byte, size) + for i := range seq { + seq[i] = "ACGT"[i%4] + } + + name := "k" + string(rune('0'+cfg.k/10)) + string(rune('0'+cfg.k%10)) + + "_m" + string(rune('0'+cfg.m/10)) + string(rune('0'+cfg.m%10)) + + "_size" + string(rune('0'+(size/10000)%10)) + + string(rune('0'+(size/1000)%10)) + + string(rune('0'+(size/100)%10)) + + string(rune('0'+(size/10)%10)) + + string(rune('0'+size%10)) + + b.Run(name, func(b *testing.B) { + buffer := make([]SuperKmer, 0, size/cfg.k) + b.ResetTimer() + b.SetBytes(int64(size)) + + for i := 0; i < b.N; i++ { + ExtractSuperKmers(seq, cfg.k, cfg.m, &buffer) + } + }) + } + } +}