diff --git a/Makefile b/Makefile index 3ad2ffc..a3eeba0 100644 --- a/Makefile +++ b/Makefile @@ -108,5 +108,27 @@ ifneq ($(strip $(COMMIT_ID)),) @rm -f $(OUTPUT) endif -.PHONY: all obitools update-deps obitests githubtests .FORCE +jjnew: + @echo "$(YELLOW)→ Creating a new commit...$(NC)" + @echo "$(BLUE)→ Documenting current commit...$(NC)" + @jj auto-describe + @echo "$(BLUE)→ Done.$(NC)" + @jj new + @echo "$(GREEN)✓ New commit created$(NC)" + +jjpush: + @echo "$(YELLOW)→ Pushing commit to repository...$(NC)" + @echo "$(BLUE)→ Documenting current commit...$(NC)" + @jj auto-describe + @echo "$(BLUE)→ Done.$(NC)" + @jj git push --change @ + @echo "$(GREEN)✓ Commit pushed to repository$(NC)" + +jjfetch: + @echo "$(YELLOW)→ Pulling latest commits...$(NC)" + @jj git fetch + @jj new master@origin + @echo "$(GREEN)✓ Latest commits pulled$(NC)" + +.PHONY: all obitools update-deps obitests githubtests jjnew jjpush jjfetch .FORCE .FORCE: \ No newline at end of file diff --git a/blackboard/Prospective/kmer_index_design.md b/blackboard/Prospective/kmer_index_design.md new file mode 100644 index 0000000..b80cc6d --- /dev/null +++ b/blackboard/Prospective/kmer_index_design.md @@ -0,0 +1,213 @@ +# Index de k-mers pour génomes de grande taille + +## Contexte et objectifs + +### Cas d'usage + +- Indexation de k-mers longs (k=31) pour des génomes de grande taille (< 10 Go par génome) +- Nombre de génomes : plusieurs dizaines à quelques centaines +- Indexation en parallèle +- Stockage sur disque +- Possibilité d'ajouter des génomes, mais pas de modifier un génome existant + +### Requêtes cibles + +- **Présence/absence** d'un k-mer dans un génome +- **Intersection** entre génomes +- **Distances** : Jaccard (présence/absence) et potentiellement Bray-Curtis (comptage) + +### Ressources disponibles + +- 128 Go de RAM +- Stockage disque + +--- + +## Estimation des volumes + +### Par génome + +- **10 Go de séquence** → ~10¹⁰ k-mers bruts (chevauchants) +- **Après déduplication** : 
typiquement 10-50% de k-mers uniques → **~1-5 × 10⁹ k-mers distincts** + +### Espace théorique + +- **k=31** → 62 bits → ~4.6 × 10¹⁸ k-mers possibles +- Table d'indexation directe impossible + +--- + +## Métriques de distance + +### Présence/absence (binaire) + +- **Jaccard** : |A ∩ B| / |A ∪ B| +- **Sørensen-Dice** : 2|A ∩ B| / (|A| + |B|) + +### Comptage (abondance) + +- **Bray-Curtis** : 1 - (2 × Σ min(aᵢ, bᵢ)) / (Σ aᵢ + Σ bᵢ) + +Note : Pour Bray-Curtis, le stockage des comptages est nécessaire, ce qui augmente significativement la taille de l'index. + +--- + +## Options d'indexation + +### Option 1 : Bloom Filter par génome + +**Principe** : Structure probabiliste pour test d'appartenance. + +**Avantages :** +- Très compact : ~10 bits/élément pour FPR ~1% +- Construction rapide, streaming +- Facile à sérialiser/désérialiser +- Intersection et Jaccard estimables via formules analytiques + +**Inconvénients :** +- Faux positifs (pas de faux négatifs) +- Distances approximatives + +**Taille estimée** : 1-6 Go par génome (selon FPR cible) + +#### Dimensionnement des Bloom filters + +``` +\mathrm{FPR} \;=\; \left(1 - e^{-h n / m}\right)^h +``` + + +| Bits/élément | FPR optimal | h (hash functions) | +|--------------|-------------|---------------------| +| 8 | ~2% | 5-6 | +| 10 | ~1% | 7 | +| 12 | ~0.3% | 8 | +| 16 | ~0.05% | 11 | + +Formule du taux de faux positifs : +``` +FPR ≈ (1 - e^(-hn/m))^h +``` +Où n = nombre d'éléments, m = nombre de bits, h = nombre de hash functions (noté h pour éviter la confusion avec k, la taille des k-mers). + +### Option 2 : Ensemble trié de k-mers + +**Principe** : Stocker les k-mers (uint64) triés, avec compression possible.
+ +**Avantages :** +- Exact (pas de faux positifs) +- Intersection/union par merge sort O(n+m) +- Compression efficace (delta encoding sur k-mers triés) + +**Inconvénients :** +- Plus volumineux : 8 octets/k-mer +- Construction plus lente (tri nécessaire) + +**Taille estimée** : 8-40 Go par génome (non compressé) + +### Option 3 : MPHF (Minimal Perfect Hash Function) + +**Principe** : Fonction de hash parfaite minimale pour les k-mers présents. + +**Avantages :** +- Très compact : ~3-4 bits/élément +- Lookup O(1) +- Exact pour les k-mers présents + +**Inconvénients :** +- Construction coûteuse (plusieurs passes) +- Statique (pas d'ajout de k-mers après construction) +- Ne distingue pas "absent" vs "jamais vu" sans structure auxiliaire + +### Option 4 : Hybride MPHF + Bloom filter + +- MPHF pour mapping compact des k-mers présents +- Bloom filter pour pré-filtrage des absents + +--- + +## Optimisation : Indexation de (k-2)-mers pour requêtes k-mers + +### Principe + +Au lieu d'indexer directement les 31-mers dans un Bloom filter, on indexe les 29-mers. Pour tester la présence d'un 31-mer, on vérifie que les **trois 29-mers** qu'il contient sont présents : + +- positions 0-28 +- positions 1-29 +- positions 2-30 + +### Analyse probabiliste + +Si le Bloom filter a un FPR de p pour un 29-mer individuel, le FPR effectif pour un 31-mer devient au mieux **p³** (les trois requêtes doivent toutes être des faux positifs). Attention : cette borne suppose les trois requêtes indépendantes et qu'aucun des trois 29-mers n'est réellement présent dans le génome ; or deux 31-mers différents peuvent partager un 29-mer, donc un 31-mer absent peut avoir ses trois 29-mers réellement indexés — ces faux positifs « structurels » s'ajoutent au p³ et doivent être quantifiés sur données réelles. + +| FPR 29-mer | FPR 31-mer effectif | +|------------|---------------------| +| 10% | 0.1% | +| 5% | 0.0125% | +| 1% | 0.0001% | + +### Avantages + +1. **Moins d'éléments à stocker** : il y a moins de 29-mers distincts que de 31-mers distincts dans un génome (deux 31-mers différents peuvent partager un même 29-mer) + +2. **FPR drastiquement réduit** : FPR³ avec seulement 3 requêtes + +3.
**Index plus compact** : on peut utiliser moins de bits par élément (FPR plus élevé acceptable sur le 29-mer) tout en obtenant un FPR très bas sur le 31-mer + +### Trade-off + +Un Bloom filter à **5-6 bits/élément** pour les 29-mers donnerait un FPR effectif < 0.01% pour les 31-mers, soit environ **2× plus compact** que l'approche directe à qualité égale. + +**Coût** : 3× plus de requêtes par lookup (mais les requêtes Bloom sont très rapides). + +--- + +## Accélération des calculs de distance : MinHash + +### Principe + +Pré-calculer une "signature" compacte (sketch) de chaque génome permettant d'estimer rapidement Jaccard sans charger les index complets. + +### Avantages + +- Matrice de distances entre 100+ génomes en quelques secondes +- Signature de taille fixe (ex: 1000-10000 hash values) quel que soit le génome +- Stockage minimal + +### Utilisation + +1. Construction : une passe sur les k-mers de chaque génome +2. Distance : comparaison des sketches en O(taille du sketch) + +--- + +## Architecture recommandée + +### Pour présence/absence + Jaccard + +1. **Index principal** : Bloom filter de (k-2)-mers avec l'optimisation décrite + - Compact (~3-5 Go par génome) + - FPR très bas pour les k-mers grâce aux requêtes triples + +2. **Sketches MinHash** : pour calcul rapide des distances entre génomes + - Quelques Ko par génome + - Permet exploration rapide de la matrice de distances + +### Pour comptage + Bray-Curtis + +1. **Index principal** : k-mers triés + comptages + - uint64 (k-mer) + uint8/uint16 (count) + - Compression delta possible + - Plus volumineux mais exact + +2. **Sketches** : variantes de MinHash pour données pondérées (ex: HyperMinHash) + +--- + +## Prochaines étapes + +1. Implémenter un Bloom filter optimisé pour k-mers +2. Implémenter l'optimisation (k-2)-mer → k-mer +3. Implémenter MinHash pour les sketches +4. Définir le format de sérialisation sur disque +5. 
Benchmarker sur des génomes réels diff --git a/pkg/obikmer/encodekmer.go b/pkg/obikmer/encodekmer.go new file mode 100644 index 0000000..7604ea8 --- /dev/null +++ b/pkg/obikmer/encodekmer.go @@ -0,0 +1,183 @@ +package obikmer + +// EncodeKmers converts a DNA sequence to a slice of encoded k-mers. +// Each nucleotide is encoded on 2 bits according to __single_base_code__: +// - A = 0 (00) +// - C = 1 (01) +// - G = 2 (10) +// - T/U = 3 (11) +// +// The function returns overlapping k-mers of size k encoded as uint64. +// For a sequence of length n, it returns n-k+1 k-mers. +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) +// - k: k-mer size (must be between 1 and 32) +// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. +// +// Returns: +// - slice of uint64 encoded k-mers +// - nil if sequence is shorter than k or k is invalid +func EncodeKmers(seq []byte, k int, buffer *[]uint64) []uint64 { + if k < 1 || k > 32 || len(seq) < k { + return nil + } + + n := len(seq) - k + 1 + + var result []uint64 + if buffer == nil { + result = make([]uint64, 0, n) + } else { + result = (*buffer)[:0] + } + + // Mask to keep only k*2 bits + mask := uint64(1)<<(k*2) - 1 + + // Build the first k-mer + var kmer uint64 + for i := 0; i < k; i++ { + kmer <<= 2 + kmer |= uint64(__single_base_code__[seq[i]&31]) + } + result = append(result, kmer) + + // Slide through the rest of the sequence + for i := k; i < len(seq); i++ { + kmer <<= 2 + kmer |= uint64(__single_base_code__[seq[i]&31]) + kmer &= mask + result = append(result, kmer) + } + + return result +} + +// ReverseComplement computes the reverse complement of an encoded k-mer. +// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11). +// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11. +// The reverse swaps the order of 2-bit pairs. 
+// +// Parameters: +// - kmer: the encoded k-mer +// - k: the k-mer size (number of nucleotides) +// +// Returns: +// - the reverse complement of the k-mer +func ReverseComplement(kmer uint64, k int) uint64 { + // Step 1: Complement - XOR with all 1s to flip A↔T and C↔G + // For a k-mer of size k, we only want to flip the lower k*2 bits + mask := uint64(1)<<(k*2) - 1 + rc := (^kmer) & mask + + // Step 2: Reverse the order of 2-bit pairs + // We use a series of swaps at increasing granularity + rc = ((rc & 0x3333333333333333) << 2) | ((rc & 0xCCCCCCCCCCCCCCCC) >> 2) // Swap adjacent pairs + rc = ((rc & 0x0F0F0F0F0F0F0F0F) << 4) | ((rc & 0xF0F0F0F0F0F0F0F0) >> 4) // Swap nibbles + rc = ((rc & 0x00FF00FF00FF00FF) << 8) | ((rc & 0xFF00FF00FF00FF00) >> 8) // Swap bytes + rc = ((rc & 0x0000FFFF0000FFFF) << 16) | ((rc & 0xFFFF0000FFFF0000) >> 16) // Swap 16-bit words + rc = (rc << 32) | (rc >> 32) // Swap 32-bit words + + // Step 3: Shift right to align the k-mer (we reversed all 32 pairs, need only k) + rc >>= (64 - k*2) + + return rc +} + +// NormalizeKmer returns the lexicographically smaller of a k-mer and its +// reverse complement. This canonical form ensures that a k-mer and its +// reverse complement map to the same value. +// +// Parameters: +// - kmer: the encoded k-mer +// - k: the k-mer size (number of nucleotides) +// +// Returns: +// - the canonical (normalized) k-mer +func NormalizeKmer(kmer uint64, k int) uint64 { + rc := ReverseComplement(kmer, k) + if rc < kmer { + return rc + } + return kmer +} + +// EncodeNormalizedKmers converts a DNA sequence to a slice of normalized k-mers. +// Each k-mer is replaced by the lexicographically smaller of itself and its +// reverse complement. This ensures that forward and reverse complement sequences +// produce the same k-mer set. 
+// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) +// - k: k-mer size (must be between 1 and 32) +// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. +// +// Returns: +// - slice of uint64 normalized k-mers +// - nil if sequence is shorter than k or k is invalid +func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 { + if k < 1 || k > 32 || len(seq) < k { + return nil + } + + n := len(seq) - k + 1 + + var result []uint64 + if buffer == nil { + result = make([]uint64, 0, n) + } else { + result = (*buffer)[:0] + } + + // Mask to keep only k*2 bits + mask := uint64(1)<<(k*2) - 1 + + // Shift amount for adding to reverse complement (high position) + rcShift := uint((k - 1) * 2) + + // Complement lookup: A(00)->T(11), C(01)->G(10), G(10)->C(01), T(11)->A(00) + // This is simply XOR with 3 + + // Build the first k-mer (forward and reverse complement) + var fwd, rvc uint64 + for i := 0; i < k; i++ { + code := uint64(__single_base_code__[seq[i]&31]) + // Forward: shift left and add new code at low end + fwd <<= 2 + fwd |= code + // Reverse complement: shift right and add complement at high end + rvc >>= 2 + rvc |= (code ^ 3) << rcShift + } + + // Store the normalized (canonical) k-mer + if fwd <= rvc { + result = append(result, fwd) + } else { + result = append(result, rvc) + } + + // Slide through the rest of the sequence + for i := k; i < len(seq); i++ { + code := uint64(__single_base_code__[seq[i]&31]) + + // Update forward k-mer: shift left, add new code, mask + fwd <<= 2 + fwd |= code + fwd &= mask + + // Update reverse complement: shift right, add complement at high end + rvc >>= 2 + rvc |= (code ^ 3) << rcShift + + // Store the normalized k-mer + if fwd <= rvc { + result = append(result, fwd) + } else { + result = append(result, rvc) + } + } + + return result +} diff --git a/pkg/obikmer/encodekmer_test.go b/pkg/obikmer/encodekmer_test.go new file mode 100644 
index 0000000..e89c0c3 --- /dev/null +++ b/pkg/obikmer/encodekmer_test.go @@ -0,0 +1,518 @@ +package obikmer + +import ( + "bytes" + "testing" +) + +// TestEncodeKmersBasic tests basic k-mer encoding +func TestEncodeKmersBasic(t *testing.T) { + tests := []struct { + name string + seq string + k int + expected []uint64 + }{ + { + name: "simple 4-mer ACGT", + seq: "ACGT", + k: 4, + expected: []uint64{0b00011011}, // A=00, C=01, G=10, T=11 -> 00 01 10 11 = 27 + }, + { + name: "simple 2-mer AC", + seq: "AC", + k: 2, + expected: []uint64{0b0001}, // A=00, C=01 -> 00 01 = 1 + }, + { + name: "sliding 2-mer ACGT", + seq: "ACGT", + k: 2, + expected: []uint64{0b0001, 0b0110, 0b1011}, // AC=1, CG=6, GT=11 + }, + { + name: "lowercase", + seq: "acgt", + k: 4, + expected: []uint64{0b00011011}, + }, + { + name: "with U instead of T", + seq: "ACGU", + k: 4, + expected: []uint64{0b00011011}, // U encodes same as T + }, + { + name: "8-mer", + seq: "ACGTACGT", + k: 8, + expected: []uint64{0b0001101100011011}, // ACGTACGT + }, + { + name: "32-mer max size", + seq: "ACGTACGTACGTACGTACGTACGTACGTACGT", + k: 32, + expected: []uint64{0x1B1B1B1B1B1B1B1B}, // ACGTACGT repeated 4 times + }, + { + name: "longer sequence sliding", + seq: "AAACCCGGG", + k: 3, + expected: []uint64{ + 0b000000, // AAA = 0 + 0b000001, // AAC = 1 + 0b000101, // ACC = 5 + 0b010101, // CCC = 21 + 0b010110, // CCG = 22 + 0b011010, // CGG = 26 + 0b101010, // GGG = 42 + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := EncodeKmers([]byte(tt.seq), tt.k, nil) + + if len(result) != len(tt.expected) { + t.Errorf("length mismatch: got %d, want %d", len(result), len(tt.expected)) + return + } + + for i, v := range result { + if v != tt.expected[i] { + t.Errorf("position %d: got %d (0b%b), want %d (0b%b)", + i, v, v, tt.expected[i], tt.expected[i]) + } + } + }) + } +} + +// TestEncodeKmersEdgeCases tests edge cases +func TestEncodeKmersEdgeCases(t *testing.T) { + // Empty sequence + 
result := EncodeKmers([]byte{}, 4, nil) + if result != nil { + t.Errorf("empty sequence should return nil, got %v", result) + } + + // k > sequence length + result = EncodeKmers([]byte("ACG"), 4, nil) + if result != nil { + t.Errorf("k > seq length should return nil, got %v", result) + } + + // k = 0 + result = EncodeKmers([]byte("ACGT"), 0, nil) + if result != nil { + t.Errorf("k=0 should return nil, got %v", result) + } + + // k > 32 + result = EncodeKmers([]byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGT"), 33, nil) + if result != nil { + t.Errorf("k>32 should return nil, got %v", result) + } + + // k = sequence length (single k-mer) + result = EncodeKmers([]byte("ACGT"), 4, nil) + if len(result) != 1 { + t.Errorf("k=seq_len should return 1 k-mer, got %d", len(result)) + } +} + +// TestEncodeKmersBuffer tests buffer reuse +func TestEncodeKmersBuffer(t *testing.T) { + seq := []byte("ACGTACGTACGT") + k := 4 + + // First call without buffer + result1 := EncodeKmers(seq, k, nil) + + // Second call with buffer - pre-allocate with capacity + buffer := make([]uint64, 0, 100) + result2 := EncodeKmers(seq, k, &buffer) + + if len(result1) != len(result2) { + t.Errorf("buffer reuse: length mismatch %d vs %d", len(result1), len(result2)) + } + + for i := range result1 { + if result1[i] != result2[i] { + t.Errorf("buffer reuse: position %d mismatch", i) + } + } + + // Verify results are correct + if len(result2) == 0 { + t.Errorf("result should not be empty") + } + + // Test multiple calls with same buffer to verify no memory issues + for i := 0; i < 10; i++ { + result3 := EncodeKmers(seq, k, &buffer) + if len(result3) != len(result1) { + t.Errorf("iteration %d: length mismatch", i) + } + } +} + +// TestEncodeKmersVariousLengths tests encoding with various sequence lengths +func TestEncodeKmersVariousLengths(t *testing.T) { + lengths := []int{1, 4, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 100, 256, 1000} + k := 8 + + for _, length := range lengths { + // Generate test sequence + 
seq := make([]byte, length) + for i := range seq { + seq[i] = "ACGT"[i%4] + } + + if length < k { + continue + } + + t.Run("length_"+string(rune('0'+length/100))+string(rune('0'+(length%100)/10))+string(rune('0'+length%10)), func(t *testing.T) { + result := EncodeKmers(seq, k, nil) + + expectedLen := length - k + 1 + if len(result) != expectedLen { + t.Errorf("length mismatch: got %d, want %d", len(result), expectedLen) + } + }) + } +} + +// TestEncodeKmersLongSequence tests with a longer realistic sequence +func TestEncodeKmersLongSequence(t *testing.T) { + // Simulate a realistic DNA sequence + seq := bytes.Repeat([]byte("ACGTACGTNNACGTACGT"), 100) + k := 16 + + result := EncodeKmers(seq, k, nil) + expectedLen := len(seq) - k + 1 + + if len(result) != expectedLen { + t.Fatalf("length mismatch: got %d, want %d", len(result), expectedLen) + } +} + +// BenchmarkEncodeKmers benchmarks the encoding function +func BenchmarkEncodeKmers(b *testing.B) { + // Create test sequences of various sizes + sizes := []int{100, 1000, 10000, 100000} + kSizes := []int{8, 16, 32} + + for _, k := range kSizes { + for _, size := range sizes { + seq := make([]byte, size) + for i := range seq { + seq[i] = "ACGT"[i%4] + } + + name := "k" + string(rune('0'+k/10)) + string(rune('0'+k%10)) + "_size" + string(rune('0'+size/10000)) + string(rune('0'+(size%10000)/1000)) + string(rune('0'+(size%1000)/100)) + string(rune('0'+(size%100)/10)) + string(rune('0'+size%10)) + b.Run(name, func(b *testing.B) { + buffer := make([]uint64, 0, size) + b.ResetTimer() + b.SetBytes(int64(size)) + + for i := 0; i < b.N; i++ { + EncodeKmers(seq, k, &buffer) + } + }) + } + } +} + +// TestEncodeNucleotide verifies nucleotide encoding +func TestEncodeNucleotide(t *testing.T) { + testCases := []struct { + nucleotide byte + expected byte + }{ + {'A', 0}, + {'a', 0}, + {'C', 1}, + {'c', 1}, + {'G', 2}, + {'g', 2}, + {'T', 3}, + {'t', 3}, + {'U', 3}, + {'u', 3}, + } + + for _, tc := range testCases { + result := 
EncodeNucleotide(tc.nucleotide) + if result != tc.expected { + t.Errorf("EncodeNucleotide('%c') = %d, want %d", + tc.nucleotide, result, tc.expected) + } + } +} + +// TestReverseComplement tests the reverse complement function +func TestReverseComplement(t *testing.T) { + tests := []struct { + name string + seq string + k int + expected string // expected reverse complement sequence + }{ + { + name: "ACGT -> ACGT (palindrome)", + seq: "ACGT", + k: 4, + expected: "ACGT", + }, + { + name: "AAAA -> TTTT", + seq: "AAAA", + k: 4, + expected: "TTTT", + }, + { + name: "TTTT -> AAAA", + seq: "TTTT", + k: 4, + expected: "AAAA", + }, + { + name: "CCCC -> GGGG", + seq: "CCCC", + k: 4, + expected: "GGGG", + }, + { + name: "AACG -> CGTT", + seq: "AACG", + k: 4, + expected: "CGTT", + }, + { + name: "AC -> GT", + seq: "AC", + k: 2, + expected: "GT", + }, + { + name: "ACGTACGT -> ACGTACGT (palindrome)", + seq: "ACGTACGT", + k: 8, + expected: "ACGTACGT", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Encode the input sequence + kmers := EncodeKmers([]byte(tt.seq), tt.k, nil) + if len(kmers) != 1 { + t.Fatalf("expected 1 k-mer, got %d", len(kmers)) + } + + // Compute reverse complement + rc := ReverseComplement(kmers[0], tt.k) + + // Encode the expected reverse complement + expectedKmers := EncodeKmers([]byte(tt.expected), tt.k, nil) + if len(expectedKmers) != 1 { + t.Fatalf("expected 1 k-mer for expected, got %d", len(expectedKmers)) + } + + if rc != expectedKmers[0] { + t.Errorf("ReverseComplement(%s) = %d (0b%b), want %d (0b%b) for %s", + tt.seq, rc, rc, expectedKmers[0], expectedKmers[0], tt.expected) + } + }) + } +} + +// TestReverseComplementInvolution tests that RC(RC(x)) = x +func TestReverseComplementInvolution(t *testing.T) { + testSeqs := []string{"ACGT", "AAAA", "TTTT", "ACGTACGT", "AACGTTGC", "AC", "ACGTACGTACGTACGT", "ACGTACGTACGTACGTACGTACGTACGTACGT"} + + for _, seq := range testSeqs { + k := len(seq) + kmers := 
EncodeKmers([]byte(seq), k, nil) + if len(kmers) != 1 { + continue + } + + original := kmers[0] + rc := ReverseComplement(original, k) + rcrc := ReverseComplement(rc, k) + + if rcrc != original { + t.Errorf("RC(RC(%s)) != %s: got %d, want %d", seq, seq, rcrc, original) + } + } +} + +// TestNormalizeKmer tests the normalization function +func TestNormalizeKmer(t *testing.T) { + tests := []struct { + name string + seq string + k int + }{ + {"ACGT palindrome", "ACGT", 4}, + {"AAAA vs TTTT", "AAAA", 4}, + {"TTTT vs AAAA", "TTTT", 4}, + {"AACG vs CGTT", "AACG", 4}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + kmers := EncodeKmers([]byte(tt.seq), tt.k, nil) + if len(kmers) != 1 { + t.Fatalf("expected 1 k-mer, got %d", len(kmers)) + } + + kmer := kmers[0] + rc := ReverseComplement(kmer, tt.k) + normalized := NormalizeKmer(kmer, tt.k) + + // Normalized should be the minimum + expectedNorm := kmer + if rc < kmer { + expectedNorm = rc + } + + if normalized != expectedNorm { + t.Errorf("NormalizeKmer(%d) = %d, want %d", kmer, normalized, expectedNorm) + } + + // Normalizing the RC should give the same result + normalizedRC := NormalizeKmer(rc, tt.k) + if normalizedRC != normalized { + t.Errorf("NormalizeKmer(RC) = %d, want %d (same as NormalizeKmer(fwd))", normalizedRC, normalized) + } + }) + } +} + +// TestEncodeNormalizedKmersBasic tests basic normalized k-mer encoding +func TestEncodeNormalizedKmersBasic(t *testing.T) { + // Test that a sequence and its reverse complement produce the same normalized k-mers + seq := []byte("AACGTT") + revComp := []byte("AACGTT") // This is a palindrome! 
+ + k := 4 + kmers1 := EncodeNormalizedKmers(seq, k, nil) + kmers2 := EncodeNormalizedKmers(revComp, k, nil) + + if len(kmers1) != len(kmers2) { + t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2)) + } + + // For a palindrome, forward and reverse should give the same k-mers + for i := range kmers1 { + if kmers1[i] != kmers2[len(kmers2)-1-i] { + t.Logf("Note: position %d differs (expected for non-palindromic sequences)", i) + } + } +} + +// TestEncodeNormalizedKmersSymmetry tests that seq and its RC produce same normalized k-mers (reversed) +func TestEncodeNormalizedKmersSymmetry(t *testing.T) { + // Manually construct a sequence and its reverse complement + seq := []byte("ACGTAACCGG") + + // Compute reverse complement manually + rcMap := map[byte]byte{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + revComp := make([]byte, len(seq)) + for i, b := range seq { + revComp[len(seq)-1-i] = rcMap[b] + } + + k := 4 + kmers1 := EncodeNormalizedKmers(seq, k, nil) + kmers2 := EncodeNormalizedKmers(revComp, k, nil) + + if len(kmers1) != len(kmers2) { + t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2)) + } + + // The normalized k-mers should be the same but in reverse order + for i := range kmers1 { + j := len(kmers2) - 1 - i + if kmers1[i] != kmers2[j] { + t.Errorf("position %d vs %d: %d != %d", i, j, kmers1[i], kmers2[j]) + } + } +} + +// TestEncodeNormalizedKmersConsistency verifies normalized k-mers match manual normalization +func TestEncodeNormalizedKmersConsistency(t *testing.T) { + seq := []byte("ACGTACGTACGTACGT") + k := 8 + + // Get k-mers both ways + rawKmers := EncodeKmers(seq, k, nil) + normalizedKmers := EncodeNormalizedKmers(seq, k, nil) + + if len(rawKmers) != len(normalizedKmers) { + t.Fatalf("length mismatch: %d vs %d", len(rawKmers), len(normalizedKmers)) + } + + // Verify each normalized k-mer matches manual normalization + for i, raw := range rawKmers { + expected := NormalizeKmer(raw, k) + if normalizedKmers[i] != expected { + 
t.Errorf("position %d: EncodeNormalizedKmers gave %d, NormalizeKmer gave %d", + i, normalizedKmers[i], expected) + } + } +} + +// BenchmarkEncodeNormalizedKmers benchmarks the normalized encoding function +func BenchmarkEncodeNormalizedKmers(b *testing.B) { + sizes := []int{100, 1000, 10000, 100000} + kSizes := []int{8, 16, 32} + + for _, k := range kSizes { + for _, size := range sizes { + seq := make([]byte, size) + for i := range seq { + seq[i] = "ACGT"[i%4] + } + + name := "k" + string(rune('0'+k/10)) + string(rune('0'+k%10)) + "_size" + string(rune('0'+size/10000)) + string(rune('0'+(size%10000)/1000)) + string(rune('0'+(size%1000)/100)) + string(rune('0'+(size%100)/10)) + string(rune('0'+size%10)) + b.Run(name, func(b *testing.B) { + buffer := make([]uint64, 0, size) + b.ResetTimer() + b.SetBytes(int64(size)) + + for i := 0; i < b.N; i++ { + EncodeNormalizedKmers(seq, k, &buffer) + } + }) + } + } +} + +// BenchmarkReverseComplement benchmarks the reverse complement function +func BenchmarkReverseComplement(b *testing.B) { + kmer := uint64(0x123456789ABCDEF0) + k := 32 + + b.ResetTimer() + for i := 0; i < b.N; i++ { + ReverseComplement(kmer, k) + } +} + +// BenchmarkNormalizeKmer benchmarks the normalization function +func BenchmarkNormalizeKmer(b *testing.B) { + kmer := uint64(0x123456789ABCDEF0) + k := 32 + + b.ResetTimer() + for i := 0; i < b.N; i++ { + NormalizeKmer(kmer, k) + } +}