obitools4/pkg/obikmer/encodekmer_test.go

package obikmer

import (
	"bytes"
	"fmt"
	"testing"
)

// TestEncodeKmersBasic compares the output of EncodeKmers with hand-computed
// 2-bit encodings (A=00, C=01, G=10, T/U=11), covering single k-mers, sliding
// windows, lowercase input, U as a synonym of T and the maximum k of 32.
func TestEncodeKmersBasic(t *testing.T) {
	type encodeCase struct {
		name string
		seq  string
		k    int
		want []uint64
	}

	cases := []encodeCase{
		// A=00, C=01, G=10, T=11 -> 00 01 10 11 = 27
		{"simple 4-mer ACGT", "ACGT", 4, []uint64{0b00011011}},
		// A=00, C=01 -> 00 01 = 1
		{"simple 2-mer AC", "AC", 2, []uint64{0b0001}},
		// AC=1, CG=6, GT=11
		{"sliding 2-mer ACGT", "ACGT", 2, []uint64{0b0001, 0b0110, 0b1011}},
		{"lowercase", "acgt", 4, []uint64{0b00011011}},
		// U encodes same as T
		{"with U instead of T", "ACGU", 4, []uint64{0b00011011}},
		// ACGTACGT
		{"8-mer", "ACGTACGT", 8, []uint64{0b0001101100011011}},
		// ACGTACGT repeated 4 times
		{"32-mer max size", "ACGTACGTACGTACGTACGTACGTACGTACGT", 32, []uint64{0x1B1B1B1B1B1B1B1B}},
		{"longer sequence sliding", "AAACCCGGG", 3, []uint64{
			0b000000, // AAA = 0
			0b000001, // AAC = 1
			0b000101, // ACC = 5
			0b010101, // CCC = 21
			0b010110, // CCG = 22
			0b011010, // CGG = 26
			0b101010, // GGG = 42
		}},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := EncodeKmers([]byte(c.seq), c.k, nil)

			// A wrong count makes the element-wise comparison meaningless,
			// so stop this subtest right away.
			if len(got) != len(c.want) {
				t.Fatalf("length mismatch: got %d, want %d", len(got), len(c.want))
			}

			for pos := range got {
				if got[pos] == c.want[pos] {
					continue
				}
				t.Errorf("position %d: got %d (0b%b), want %d (0b%b)",
					pos, got[pos], got[pos], c.want[pos], c.want[pos])
			}
		})
	}
}

// TestEncodeKmersEdgeCases exercises the inputs for which EncodeKmers is
// expected to return nil (empty sequence, k larger than the sequence, k = 0,
// k > 32) and the boundary where k equals the sequence length.
func TestEncodeKmersEdgeCases(t *testing.T) {
	// Inputs that must be rejected with a nil result; failFmt is the message
	// reported when a non-nil slice comes back.
	rejected := []struct {
		seq     []byte
		k       int
		failFmt string
	}{
		{[]byte{}, 4, "empty sequence should return nil, got %v"},
		{[]byte("ACG"), 4, "k > seq length should return nil, got %v"},
		{[]byte("ACGT"), 0, "k=0 should return nil, got %v"},
		{[]byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGT"), 33, "k>32 should return nil, got %v"},
	}

	for _, c := range rejected {
		if got := EncodeKmers(c.seq, c.k, nil); got != nil {
			t.Errorf(c.failFmt, got)
		}
	}

	// k = sequence length (single k-mer)
	if single := EncodeKmers([]byte("ACGT"), 4, nil); len(single) != 1 {
		t.Errorf("k=seq_len should return 1 k-mer, got %d", len(single))
	}
}

// TestEncodeKmersBuffer verifies that EncodeKmers returns the same k-mers
// whether it is called with a nil buffer or with a caller-supplied buffer,
// and that the result stays the same when that buffer is reused.
func TestEncodeKmersBuffer(t *testing.T) {
	seq := []byte("ACGTACGTACGT")
	k := 4

	// First call without buffer: this is the reference result.
	result1 := EncodeKmers(seq, k, nil)

	// Second call with buffer - pre-allocate with capacity
	buffer := make([]uint64, 0, 100)
	result2 := EncodeKmers(seq, k, &buffer)

	// Abort on a length mismatch: the element-wise loop below indexes
	// result2 with result1's indices and would panic otherwise.
	if len(result1) != len(result2) {
		t.Fatalf("buffer reuse: length mismatch %d vs %d", len(result1), len(result2))
	}

	for i := range result1 {
		if result1[i] != result2[i] {
			t.Errorf("buffer reuse: position %d mismatch", i)
		}
	}

	// Verify results are not trivially empty
	if len(result2) == 0 {
		t.Errorf("result should not be empty")
	}

	// Repeated calls with the same buffer must keep producing the reference
	// result, both in length and in content.
	for i := 0; i < 10; i++ {
		result3 := EncodeKmers(seq, k, &buffer)
		if len(result3) != len(result1) {
			t.Errorf("iteration %d: length mismatch", i)
			continue
		}
		for j := range result1 {
			if result3[j] != result1[j] {
				t.Errorf("iteration %d: position %d mismatch", i, j)
			}
		}
	}
}

// TestEncodeKmersVariousLengths checks that, for k = 8, EncodeKmers returns
// length-k+1 k-mers for a range of sequence lengths. Lengths shorter than k
// are skipped.
func TestEncodeKmersVariousLengths(t *testing.T) {
	lengths := []int{1, 4, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 100, 256, 1000}
	k := 8

	for _, length := range lengths {
		// Nothing to verify when no full k-mer fits; skip before building
		// the sequence.
		if length < k {
			continue
		}

		// Generate test sequence: ACGT repeated up to the requested length.
		seq := make([]byte, length)
		for i := range seq {
			seq[i] = "ACGT"[i%4]
		}

		// Zero-padded to three digits; Sprintf also renders four-digit
		// lengths correctly, which per-digit rune arithmetic did not.
		t.Run(fmt.Sprintf("length_%03d", length), func(t *testing.T) {
			result := EncodeKmers(seq, k, nil)

			expectedLen := length - k + 1
			if len(result) != expectedLen {
				t.Errorf("length mismatch: got %d, want %d", len(result), expectedLen)
			}
		})
	}
}

// TestEncodeKmersLongSequence runs EncodeKmers with k = 16 on an 1800-base
// sequence (an 18-base motif containing two N bases, repeated 100 times) and
// checks only the number of k-mers returned.
func TestEncodeKmersLongSequence(t *testing.T) {
	const k = 16

	// Simulate a realistic DNA sequence
	seq := bytes.Repeat([]byte("ACGTACGTNNACGTACGT"), 100)

	want := len(seq) - k + 1
	if got := len(EncodeKmers(seq, k, nil)); got != want {
		t.Fatalf("length mismatch: got %d, want %d", got, want)
	}
}

// BenchmarkEncodeKmers measures EncodeKmers over several k values and
// sequence sizes, reusing one pre-allocated output buffer per sub-benchmark.
// Sub-benchmarks are named "kKK_sizeNNNNN" with zero padding.
func BenchmarkEncodeKmers(b *testing.B) {
	// Create test sequences of various sizes
	sizes := []int{100, 1000, 10000, 100000}
	kSizes := []int{8, 16, 32}

	for _, k := range kSizes {
		for _, size := range sizes {
			seq := make([]byte, size)
			for i := range seq {
				seq[i] = "ACGT"[i%4]
			}

			// Sprintf keeps the padded names for small sizes and renders
			// six-digit sizes correctly; per-digit rune arithmetic turned
			// 100000 into "size:0000".
			name := fmt.Sprintf("k%02d_size%05d", k, size)
			b.Run(name, func(b *testing.B) {
				buffer := make([]uint64, 0, size)
				// Report throughput relative to the input length.
				b.SetBytes(int64(size))
				b.ResetTimer()

				for i := 0; i < b.N; i++ {
					EncodeKmers(seq, k, &buffer)
				}
			})
		}
	}
}

// TestEncodeNucleotide checks the 2-bit code returned by EncodeNucleotide for
// each supported letter in both cases: A=0, C=1, G=2, T and U=3.
func TestEncodeNucleotide(t *testing.T) {
	// Letters sharing a code are grouped; groups and letters are visited in
	// the order A, a, C, c, G, g, T, t, U, u.
	groups := []struct {
		letters string
		code    byte
	}{
		{"Aa", 0},
		{"Cc", 1},
		{"Gg", 2},
		{"TtUu", 3},
	}

	for _, g := range groups {
		for i := 0; i < len(g.letters); i++ {
			nuc := g.letters[i]
			if got := EncodeNucleotide(nuc); got != g.code {
				t.Errorf("EncodeNucleotide('%c') = %d, want %d",
					nuc, got, g.code)
			}
		}
	}
}

// TestReverseComplement encodes a sequence as a single k-mer, applies
// ReverseComplement, and compares the result with the encoding of the
// expected reverse-complement sequence.
func TestReverseComplement(t *testing.T) {
	type rcCase struct {
		name   string
		seq    string
		k      int
		wantRC string // expected reverse complement sequence
	}

	cases := []rcCase{
		{"ACGT -> ACGT (palindrome)", "ACGT", 4, "ACGT"},
		{"AAAA -> TTTT", "AAAA", 4, "TTTT"},
		{"TTTT -> AAAA", "TTTT", 4, "AAAA"},
		{"CCCC -> GGGG", "CCCC", 4, "GGGG"},
		{"AACG -> CGTT", "AACG", 4, "CGTT"},
		{"AC -> GT", "AC", 2, "GT"},
		{"ACGTACGT -> ACGTACGT (palindrome)", "ACGTACGT", 8, "ACGTACGT"},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			// The input is exactly k bases long, so one k-mer is expected.
			fwd := EncodeKmers([]byte(c.seq), c.k, nil)
			if n := len(fwd); n != 1 {
				t.Fatalf("expected 1 k-mer, got %d", n)
			}

			got := ReverseComplement(fwd[0], c.k)

			// The reference value is produced by encoding the expected
			// reverse-complement sequence.
			ref := EncodeKmers([]byte(c.wantRC), c.k, nil)
			if n := len(ref); n != 1 {
				t.Fatalf("expected 1 k-mer for expected, got %d", n)
			}

			if want := ref[0]; got != want {
				t.Errorf("ReverseComplement(%s) = %d (0b%b), want %d (0b%b) for %s",
					c.seq, got, got, want, want, c.wantRC)
			}
		})
	}
}

// TestReverseComplementInvolution checks that ReverseComplement is its own
// inverse: for every test sequence, encoded as a single k-mer with
// k = len(seq), RC(RC(x)) must equal x.
func TestReverseComplementInvolution(t *testing.T) {
	testSeqs := []string{"ACGT", "AAAA", "TTTT", "ACGTACGT", "AACGTTGC", "AC", "ACGTACGTACGTACGT", "ACGTACGTACGTACGTACGTACGTACGTACGT"}

	for _, seq := range testSeqs {
		k := len(seq)
		kmers := EncodeKmers([]byte(seq), k, nil)
		// Every sequence is exactly k long, so exactly one k-mer is
		// expected. Report anything else rather than silently skipping the
		// case, which would let the test pass without checking anything.
		if len(kmers) != 1 {
			t.Errorf("EncodeKmers(%s, k=%d): expected 1 k-mer, got %d", seq, k, len(kmers))
			continue
		}

		original := kmers[0]
		rc := ReverseComplement(original, k)
		rcrc := ReverseComplement(rc, k)

		if rcrc != original {
			t.Errorf("RC(RC(%s)) != %s: got %d, want %d", seq, seq, rcrc, original)
		}
	}
}

// TestNormalizeKmer checks that NormalizeKmer returns the smaller of a k-mer
// and its reverse complement, and that a k-mer and its reverse complement
// normalize to the same value.
func TestNormalizeKmer(t *testing.T) {
	cases := []struct {
		name string
		seq  string
		k    int
	}{
		{name: "ACGT palindrome", seq: "ACGT", k: 4},
		{name: "AAAA vs TTTT", seq: "AAAA", k: 4},
		{name: "TTTT vs AAAA", seq: "TTTT", k: 4},
		{name: "AACG vs CGTT", seq: "AACG", k: 4},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			encoded := EncodeKmers([]byte(c.seq), c.k, nil)
			if n := len(encoded); n != 1 {
				t.Fatalf("expected 1 k-mer, got %d", n)
			}

			fwd := encoded[0]
			rev := ReverseComplement(fwd, c.k)
			got := NormalizeKmer(fwd, c.k)

			// The canonical form is the numerically smaller of the two
			// strands.
			want := rev
			if fwd <= rev {
				want = fwd
			}

			if got != want {
				t.Errorf("NormalizeKmer(%d) = %d, want %d", fwd, got, want)
			}

			// Starting from the other strand must lead to the same value.
			if gotFromRev := NormalizeKmer(rev, c.k); gotFromRev != got {
				t.Errorf("NormalizeKmer(RC) = %d, want %d (same as NormalizeKmer(fwd))", gotFromRev, got)
			}
		})
	}
}

// TestEncodeNormalizedKmersBasic checks normalized k-mer encoding on a
// sequence that is its own reverse complement: the list of normalized k-mers
// must read the same forwards and backwards.
func TestEncodeNormalizedKmersBasic(t *testing.T) {
	// AACGTT reverse-complemented is AACGTT again, so the "reverse
	// complement" input is byte-for-byte the same sequence.
	seq := []byte("AACGTT")
	revComp := []byte("AACGTT") // This is a palindrome!

	k := 4
	kmers1 := EncodeNormalizedKmers(seq, k, nil)
	kmers2 := EncodeNormalizedKmers(revComp, k, nil)

	if len(kmers1) != len(kmers2) {
		t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2))
	}
	if len(kmers1) == 0 {
		t.Fatal("expected at least one normalized k-mer")
	}

	// The k-mer at position i of one strand is the reverse complement of the
	// k-mer at the mirrored position of the other strand, so after
	// normalization the two must be equal. A mismatch is a real failure and
	// is reported as such, not merely logged.
	for i := range kmers1 {
		j := len(kmers2) - 1 - i
		if kmers1[i] != kmers2[j] {
			t.Errorf("position %d vs %d: %d != %d", i, j, kmers1[i], kmers2[j])
		}
	}
}

// TestEncodeNormalizedKmersSymmetry builds the reverse complement of a
// sequence by hand and checks that both strands yield the same normalized
// k-mers, in opposite order.
func TestEncodeNormalizedKmersSymmetry(t *testing.T) {
	const k = 4
	fwd := []byte("ACGTAACCGG")

	// complement maps an uppercase base to its Watson-Crick partner; any
	// other byte maps to 0.
	complement := func(base byte) byte {
		switch base {
		case 'A':
			return 'T'
		case 'C':
			return 'G'
		case 'G':
			return 'C'
		case 'T':
			return 'A'
		}
		return 0
	}

	// Walk the forward strand backwards, complementing each base.
	rev := make([]byte, 0, len(fwd))
	for i := len(fwd) - 1; i >= 0; i-- {
		rev = append(rev, complement(fwd[i]))
	}

	fwdKmers := EncodeNormalizedKmers(fwd, k, nil)
	revKmers := EncodeNormalizedKmers(rev, k, nil)

	if len(fwdKmers) != len(revKmers) {
		t.Fatalf("length mismatch: %d vs %d", len(fwdKmers), len(revKmers))
	}

	// Position i on the forward strand pairs with the mirrored position on
	// the reverse strand.
	last := len(revKmers) - 1
	for i, kmer := range fwdKmers {
		if mirror := last - i; kmer != revKmers[mirror] {
			t.Errorf("position %d vs %d: %d != %d", i, mirror, kmer, revKmers[mirror])
		}
	}
}

// TestEncodeNormalizedKmersConsistency checks that EncodeNormalizedKmers
// agrees, position by position, with applying NormalizeKmer to the output of
// EncodeKmers.
func TestEncodeNormalizedKmersConsistency(t *testing.T) {
	const k = 8
	seq := []byte("ACGTACGTACGTACGT")

	// Encode the same sequence with and without normalization.
	plain := EncodeKmers(seq, k, nil)
	canonical := EncodeNormalizedKmers(seq, k, nil)

	if nPlain, nCanonical := len(plain), len(canonical); nPlain != nCanonical {
		t.Fatalf("length mismatch: %d vs %d", nPlain, nCanonical)
	}

	// Normalizing each raw k-mer by hand must reproduce the normalized list.
	for idx := range plain {
		want := NormalizeKmer(plain[idx], k)
		if got := canonical[idx]; got != want {
			t.Errorf("position %d: EncodeNormalizedKmers gave %d, NormalizeKmer gave %d",
				idx, got, want)
		}
	}
}

// BenchmarkEncodeNormalizedKmers measures EncodeNormalizedKmers over several
// k values and sequence sizes, reusing one pre-allocated output buffer per
// sub-benchmark. Sub-benchmarks are named "kKK_sizeNNNNN" with zero padding.
func BenchmarkEncodeNormalizedKmers(b *testing.B) {
	sizes := []int{100, 1000, 10000, 100000}
	kSizes := []int{8, 16, 32}

	for _, k := range kSizes {
		for _, size := range sizes {
			seq := make([]byte, size)
			for i := range seq {
				seq[i] = "ACGT"[i%4]
			}

			// Sprintf keeps the padded names for small sizes and renders
			// six-digit sizes correctly; per-digit rune arithmetic turned
			// 100000 into "size:0000".
			name := fmt.Sprintf("k%02d_size%05d", k, size)
			b.Run(name, func(b *testing.B) {
				buffer := make([]uint64, 0, size)
				// Report throughput relative to the input length.
				b.SetBytes(int64(size))
				b.ResetTimer()

				for i := 0; i < b.N; i++ {
					EncodeNormalizedKmers(seq, k, &buffer)
				}
			})
		}
	}
}

// benchReverseComplementSink receives the final value computed by
// BenchmarkReverseComplement so the compiler cannot discard the benchmarked
// calls as dead code.
var benchReverseComplementSink uint64

// BenchmarkReverseComplement measures ReverseComplement on a 32-mer.
func BenchmarkReverseComplement(b *testing.B) {
	kmer := uint64(0x123456789ABCDEF0)
	k := 32

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Feed each result into the next call: the loop-carried dependency
		// keeps the call from being constant-folded or hoisted.
		kmer = ReverseComplement(kmer, k)
	}
	benchReverseComplementSink = kmer
}

// benchNormalizeKmerSink receives the final value computed by
// BenchmarkNormalizeKmer so the compiler cannot discard the benchmarked
// calls as dead code.
var benchNormalizeKmerSink uint64

// BenchmarkNormalizeKmer measures NormalizeKmer on a 32-mer.
func BenchmarkNormalizeKmer(b *testing.B) {
	kmer := uint64(0x123456789ABCDEF0)
	k := 32

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		// Feed each result into the next call: the loop-carried dependency
		// keeps the call from being constant-folded or hoisted.
		kmer = NormalizeKmer(kmer, k)
	}
	benchNormalizeKmerSink = kmer
}

// TestExtractSuperKmersBasic runs ExtractSuperKmers on a few small sequences
// and applies a per-case validator to the returned super k-mers. Each
// validator receives the input sequence so that it can relate Start/End
// positions to the original bases.
func TestExtractSuperKmersBasic(t *testing.T) {
	tests := []struct {
		name     string
		seq      string
		k        int
		m        int
		validate func(t *testing.T, seq string, sks []SuperKmer)
	}{
		{
			name: "simple sequence",
			seq:  "ACGTACGTACGT",
			k:    5,
			m:    3,
			validate: func(t *testing.T, seq string, sks []SuperKmer) {
				if len(sks) == 0 {
					t.Error("expected at least one super k-mer")
				}
				// Every super k-mer must carry exactly the bases found at
				// seq[Start:End].
				for _, sk := range sks {
					// Check the bounds first so that bad positions are
					// reported as a failure instead of a slicing panic.
					if sk.Start < 0 || sk.Start > sk.End || sk.End > len(seq) {
						t.Errorf("invalid bounds [%d:%d] for sequence of length %d",
							sk.Start, sk.End, len(seq))
						continue
					}
					if string(sk.Sequence) != seq[sk.Start:sk.End] {
						t.Errorf("Sequence mismatch: seq[%d:%d] != %s", sk.Start, sk.End, sk.Sequence)
					}
				}
			},
		},
		{
			name: "single k-mer sequence",
			seq:  "ACGTACGT",
			k:    8,
			m:    4,
			validate: func(t *testing.T, seq string, sks []SuperKmer) {
				if len(sks) != 1 {
					t.Errorf("expected exactly 1 super k-mer for len(seq)==k, got %d", len(sks))
				}
				if len(sks) > 0 {
					if sks[0].Start != 0 || sks[0].End != 8 {
						t.Errorf("expected [0:8], got [%d:%d]", sks[0].Start, sks[0].End)
					}
				}
			},
		},
		{
			name: "repeating sequence",
			seq:  "AAAAAAAAAA",
			k:    5,
			m:    3,
			validate: func(t *testing.T, seq string, sks []SuperKmer) {
				// Repeating A should have same minimizer (AAA) everywhere
				if len(sks) != 1 {
					t.Errorf("expected 1 super k-mer for repeating sequence, got %d", len(sks))
				}
				if len(sks) > 0 {
					if sks[0].Start != 0 || sks[0].End != 10 {
						t.Errorf("expected super k-mer to cover entire sequence [0:10], got [%d:%d]",
							sks[0].Start, sks[0].End)
					}
				}
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := ExtractSuperKmers([]byte(tt.seq), tt.k, tt.m, nil)
			tt.validate(t, tt.seq, result)
		})
	}
}

// TestExtractSuperKmersEdgeCases checks which parameter combinations make
// ExtractSuperKmers return nil (invalid input) and which must produce a
// non-nil result (boundary values that are still valid).
func TestExtractSuperKmersEdgeCases(t *testing.T) {
	type edgeCase struct {
		name    string
		seq     string
		k, m    int
		wantNil bool
	}

	cases := []edgeCase{
		{name: "empty sequence", seq: "", k: 5, m: 3, wantNil: true},
		{name: "seq shorter than k", seq: "ACG", k: 5, m: 3, wantNil: true},
		{name: "m < 1", seq: "ACGTACGT", k: 5, m: 0, wantNil: true},
		{name: "m >= k", seq: "ACGTACGT", k: 5, m: 5, wantNil: true},
		{name: "m == k-1 (valid)", seq: "ACGTACGT", k: 5, m: 4, wantNil: false},
		{name: "k < 2", seq: "ACGTACGT", k: 1, m: 1, wantNil: true},
		{name: "k > 32", seq: "ACGTACGTACGTACGTACGTACGTACGTACGTACGT", k: 33, m: 16, wantNil: true},
		{name: "k == 32 (valid)", seq: "ACGTACGTACGTACGTACGTACGTACGTACGT", k: 32, m: 16, wantNil: false},
		{name: "seq == k (valid)", seq: "ACGTACGT", k: 8, m: 4, wantNil: false},
	}

	for _, c := range cases {
		t.Run(c.name, func(t *testing.T) {
			got := ExtractSuperKmers([]byte(c.seq), c.k, c.m, nil)

			// The two failure modes are mutually exclusive.
			switch {
			case c.wantNil && got != nil:
				t.Errorf("expected nil, got %v", got)
			case !c.wantNil && got == nil:
				t.Errorf("expected non-nil result, got nil")
			}
		})
	}
}

// TestExtractSuperKmersBoundaries verifies the Start/End positions reported
// by ExtractSuperKmers: they must lie inside the sequence, span at least k
// bases, match the Sequence field, be ordered by Start, and overlap the next
// super k-mer by at most k-1 bases.
func TestExtractSuperKmersBoundaries(t *testing.T) {
	seq := []byte("ACGTACGTGGGGAAAA")
	k := 6
	m := 3

	result := ExtractSuperKmers(seq, k, m, nil)

	if result == nil {
		t.Fatal("expected non-nil result")
	}

	// Verify each super k-mer
	for i, sk := range result {
		// Verify bounds are within sequence. This comes first because the
		// checks below slice seq with Start/End and would panic on
		// out-of-range values instead of reporting a failure.
		if sk.Start < 0 || sk.End > len(seq) {
			t.Errorf("super k-mer %d: bounds [%d:%d] outside sequence length %d",
				i, sk.Start, sk.End, len(seq))
			continue
		}

		// Verify Start < End (also required for the slice below to be valid)
		if sk.Start >= sk.End {
			t.Errorf("super k-mer %d: Start (%d) >= End (%d)", i, sk.Start, sk.End)
			continue
		}

		// Verify Sequence matches seq[Start:End]
		expected := string(seq[sk.Start:sk.End])
		actual := string(sk.Sequence)
		if actual != expected {
			t.Errorf("super k-mer %d: Sequence mismatch: got %s, want %s", i, actual, expected)
		}

		// Verify minimum length is k
		if sk.End-sk.Start < k {
			t.Errorf("super k-mer %d: length %d < k=%d", i, sk.End-sk.Start, k)
		}
	}

	// Verify super k-mers can overlap (by up to k-1 bases) but must be ordered
	// and the overlap should not exceed k-1
	for i := 0; i < len(result)-1; i++ {
		// Next super k-mer should start before or at the end of current one
		// Overlap is allowed and expected
		overlap := result[i].End - result[i+1].Start
		if overlap > k-1 {
			t.Errorf("super k-mers %d and %d overlap by %d bases (max allowed: %d): [%d:%d] and [%d:%d]",
				i, i+1, overlap, k-1, result[i].Start, result[i].End, result[i+1].Start, result[i+1].End)
		}
		// But the start positions should be ordered
		if result[i+1].Start < result[i].Start {
			t.Errorf("super k-mers %d and %d are not ordered: [%d:%d] and [%d:%d]",
				i, i+1, result[i].Start, result[i].End, result[i+1].Start, result[i+1].End)
		}
	}
}

// TestExtractSuperKmersBufferReuse verifies that ExtractSuperKmers reports
// the same minimizers and boundaries whether it is called with a nil buffer
// or with a caller-supplied buffer, and that repeated calls with the same
// buffer keep returning the same number of super k-mers.
func TestExtractSuperKmersBufferReuse(t *testing.T) {
	seq := []byte("ACGTACGTACGTACGT")
	k := 6
	m := 3

	// First call without buffer
	result1 := ExtractSuperKmers(seq, k, m, nil)

	// Second call with buffer
	buffer := make([]SuperKmer, 0, 100)
	result2 := ExtractSuperKmers(seq, k, m, &buffer)

	// Abort on a length mismatch: the element-wise loop below indexes
	// result2 with result1's indices and would panic otherwise.
	if len(result1) != len(result2) {
		t.Fatalf("buffer reuse: length mismatch %d vs %d", len(result1), len(result2))
	}

	for i := range result1 {
		if result1[i].Minimizer != result2[i].Minimizer {
			t.Errorf("position %d: minimizer mismatch", i)
		}
		if result1[i].Start != result2[i].Start || result1[i].End != result2[i].End {
			t.Errorf("position %d: boundary mismatch", i)
		}
	}

	// Test multiple calls with same buffer
	for i := 0; i < 10; i++ {
		result3 := ExtractSuperKmers(seq, k, m, &buffer)
		if len(result3) != len(result1) {
			t.Errorf("iteration %d: length mismatch", i)
		}
	}
}

// TestExtractSuperKmersCanonical checks that every minimizer reported by
// ExtractSuperKmers is in canonical form: unchanged by NormalizeKmer and not
// greater than its own reverse complement.
func TestExtractSuperKmersCanonical(t *testing.T) {
	const (
		k = 6
		m = 3
	)
	seq := []byte("ACGTACGTACGT")

	superKmers := ExtractSuperKmers(seq, k, m, nil)
	if superKmers == nil {
		t.Fatal("expected non-nil result")
	}

	for idx := range superKmers {
		minimizer := superKmers[idx].Minimizer

		// A canonical m-mer is a fixed point of normalization.
		if canon := NormalizeKmer(minimizer, m); minimizer != canon {
			t.Errorf("super k-mer %d: minimizer %d is not canonical (normalized: %d)",
				idx, minimizer, canon)
		}

		// Same property, stated directly against the reverse complement.
		if revComp := ReverseComplement(minimizer, m); minimizer > revComp {
			t.Errorf("super k-mer %d: minimizer %d > reverse complement %d (not canonical)",
				idx, minimizer, revComp)
		}
	}
}

// TestExtractSuperKmersVariousKM runs ExtractSuperKmers on one 24-base
// sequence for several (k, m) pairs and checks that each run returns at
// least one super k-mer and that none is shorter than k. Subtests are named
// "kKK_mMM" with zero padding.
func TestExtractSuperKmersVariousKM(t *testing.T) {
	seq := []byte("ACGTACGTACGTACGTACGTACGT")

	configs := []struct {
		k int
		m int
	}{
		{5, 3},
		{8, 4},
		{10, 5},
		{16, 8},
		{21, 11},
		{6, 5}, // m = k-1
		{4, 2},
	}

	for _, cfg := range configs {
		// Sprintf produces the same two-digit names as per-digit rune
		// arithmetic, without the hand-rolled digit extraction.
		t.Run(fmt.Sprintf("k%02d_m%02d", cfg.k, cfg.m), func(t *testing.T) {
			if len(seq) < cfg.k {
				t.Skip("sequence too short for this k")
			}

			result := ExtractSuperKmers(seq, cfg.k, cfg.m, nil)

			if result == nil {
				t.Fatal("expected non-nil result for valid parameters")
			}

			if len(result) == 0 {
				t.Error("expected at least one super k-mer")
			}

			// Verify each super k-mer has minimum length k
			for i, sk := range result {
				length := sk.End - sk.Start
				if length < cfg.k {
					t.Errorf("super k-mer %d has length %d < k=%d", i, length, cfg.k)
				}
			}
		})
	}
}

// BenchmarkExtractSuperKmers measures ExtractSuperKmers for several (k, m)
// pairs and sequence sizes, reusing one pre-allocated result buffer per
// sub-benchmark. Sub-benchmarks are named "kKK_mMM_sizeNNNNN" with zero
// padding.
func BenchmarkExtractSuperKmers(b *testing.B) {
	sizes := []int{100, 1000, 10000, 100000}
	configs := []struct {
		k int
		m int
	}{
		{21, 11},
		{31, 15},
		{16, 8},
		{10, 5},
	}

	for _, cfg := range configs {
		for _, size := range sizes {
			seq := make([]byte, size)
			for i := range seq {
				seq[i] = "ACGT"[i%4]
			}

			// Sprintf keeps the padded names for small sizes and renders
			// six-digit sizes correctly; taking each digit modulo 10 turned
			// 100000 into "size00000".
			name := fmt.Sprintf("k%02d_m%02d_size%05d", cfg.k, cfg.m, size)

			b.Run(name, func(b *testing.B) {
				buffer := make([]SuperKmer, 0, size/cfg.k)
				// Report throughput relative to the input length.
				b.SetBytes(int64(size))
				b.ResetTimer()

				for i := 0; i < b.N; i++ {
					ExtractSuperKmers(seq, cfg.k, cfg.m, &buffer)
				}
			})
		}
	}
}