implementation de obilowmask

2026-02-03 06:40:33 +00:00 · 2025-10-20 17:43:25 +02:00
parent 07cdd6f758
commit 4603d7973e
13 changed files with 12636 additions and 2 deletions
--- a/pkg/obikmer/encodefourmer.go
+++ b/pkg/obikmer/encodefourmer.go
@@ -23,6 +23,10 @@ var __single_base_code__ = []byte{0,
 	0, 0, 0,
 }

+func EncodeNucleotide(b byte) byte {
+	return __single_base_code__[b&31] // index the code table with the low 5 bits of b, so 'a' and 'A' share an entry
+}
+
 // Encode4mer transforms an obiseq.BioSequence into a sequence
 // of kmer of length 4. Each letter of the sequence not belonging
 // A, C, G, T, U are considered as a A. The kmer is encoded as a byte
@@ -54,7 +58,7 @@ func Encode4mer(seq *obiseq.BioSequence, buffer *[]byte) []byte {
 	code = 0
 	for ; i < 4; i++ {
 		code <<= 2
-		code += __single_base_code__[rawseq[i]&31]
+		code += EncodeNucleotide(rawseq[i])
 	}

 	*buffer = append((*buffer), code)
--- a/pkg/obikmer/kmernorm.go
+++ b/pkg/obikmer/kmernorm.go
--- a/pkg/obikmer/kmernorm_test.go
+++ b/pkg/obikmer/kmernorm_test.go
@@ -0,0 +1,77 @@
+package obikmer
+
+import "testing"
+
+func TestNormalize(t *testing.T) { // table-driven check of Normalize against the expected k-mers listed below
+	tests := []struct {
+		name     string
+		kmer     string
+		expected string
+	}{
+		// Tests with k=1
+		{"k=1 a", "a", "a"},
+		{"k=1 c", "c", "c"},
+
+		// Tests with k=2
+		{"k=2 ca", "ca", "ac"},
+		{"k=2 ac", "ac", "ac"},
+
+		// Tests with k=4
+		{"k=4 acgt", "acgt", "acgt"},
+		{"k=4 cgta", "cgta", "acgt"},
+		{"k=4 gtac", "gtac", "acgt"},
+		{"k=4 tacg", "tacg", "acgt"},
+		{"k=4 tgca", "tgca", "atgc"},
+
+		// Tests with k=6
+		{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
+		{"k=6 tttttt", "tttttt", "tttttt"},
+
+		// Tests with k>6 (computed on the fly)
+		{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
+		{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
+		{"k=7 gcatgct", "gcatgct", "atgctgc"},
+		{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
+		{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
+		{"k=10 acgtacgtac", "acgtacgtac", "acacgtacgt"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := Normalize(tt.kmer)
+			if result != tt.expected {
+				t.Errorf("Normalize(%q) = %q, want %q", tt.kmer, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestNormalizeTableConsistency(t *testing.T) {
+	// Check that every entry of the LexicographicNormalization table agrees
+	// with the value computed on the fly by getCanonicalCircular.
+	for kmer, expected := range LexicographicNormalization {
+		calculated := getCanonicalCircular(kmer)
+		if calculated != expected {
+			t.Errorf("Table inconsistency for %q: table=%q, calculated=%q",
+				kmer, expected, calculated)
+		}
+	}
+}
+
+func BenchmarkNormalizeSmall(b *testing.B) {
+	// Benchmark for k<=6 (uses the lookup table)
+	kmer := "acgtac"
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = Normalize(kmer)
+	}
+}
+
+func BenchmarkNormalizeLarge(b *testing.B) {
+	// Benchmark for k>6 (computed on the fly)
+	kmer := "acgtacgtac"
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = Normalize(kmer)
+	}
+}
--- a/pkg/obikmer/kmernormint.go
+++ b/pkg/obikmer/kmernormint.go
--- a/pkg/obikmer/kmernormint_test.go
+++ b/pkg/obikmer/kmernormint_test.go
@@ -0,0 +1,357 @@
+package obikmer
+
+import (
+	"fmt"
+	"testing"
+)
+
+func TestEncodeDecodeKmer(t *testing.T) { // checks EncodeKmer and DecodeKmer in both directions on the pairs below
+	tests := []struct {
+		kmer string // k-mer text passed to EncodeKmer and expected back from DecodeKmer
+		code int // integer code expected from EncodeKmer and passed to DecodeKmer
+	}{
+		{"a", 0},
+		{"c", 1},
+		{"g", 2},
+		{"t", 3},
+		{"aa", 0},
+		{"ac", 1},
+		{"ca", 4},
+		{"acgt", 27}, // 0b00011011
+		{"cgta", 108}, // 0b01101100
+		{"tttt", 255}, // 0b11111111
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.kmer, func(t *testing.T) {
+			// Test encoding
+			encoded := EncodeKmer(tt.kmer)
+			if encoded != tt.code {
+				t.Errorf("EncodeKmer(%q) = %d, want %d", tt.kmer, encoded, tt.code)
+			}
+
+			// Test decoding (the k-mer length must be supplied alongside the code)
+			decoded := DecodeKmer(tt.code, len(tt.kmer))
+			if decoded != tt.kmer {
+				t.Errorf("DecodeKmer(%d, %d) = %q, want %q", tt.code, len(tt.kmer), decoded, tt.kmer)
+			}
+		})
+	}
+}
+
+func TestNormalizeInt(t *testing.T) { // table-driven check of NormalizeInt, comparing integer codes of the k-mers below
+	tests := []struct {
+		name     string
+		kmer     string
+		expected string
+	}{
+		// Tests with k=1
+		{"k=1 a", "a", "a"},
+		{"k=1 c", "c", "c"},
+
+		// Tests with k=2
+		{"k=2 ca", "ca", "ac"},
+		{"k=2 ac", "ac", "ac"},
+		{"k=2 ta", "ta", "at"},
+
+		// Tests with k=4 - all rotations of "acgt"
+		{"k=4 acgt", "acgt", "acgt"},
+		{"k=4 cgta", "cgta", "acgt"},
+		{"k=4 gtac", "gtac", "acgt"},
+		{"k=4 tacg", "tacg", "acgt"},
+
+		// Tests with k=4 - rotations of "tgca"
+		{"k=4 tgca", "tgca", "atgc"},
+		{"k=4 gcat", "gcat", "atgc"},
+		{"k=4 catg", "catg", "atgc"},
+		{"k=4 atgc", "atgc", "atgc"},
+
+		// Tests with k=3 - rotations of "atg"
+		{"k=3 atg", "atg", "atg"},
+		{"k=3 tga", "tga", "atg"},
+		{"k=3 gat", "gat", "atg"},
+
+		// Tests with k=6
+		{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
+		{"k=6 tttttt", "tttttt", "tttttt"},
+
+		// Tests with k>6 (computed on the fly)
+		{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
+		{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
+		{"k=7 gcatgct", "gcatgct", "atgctgc"},
+		{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
+		{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			kmerCode := EncodeKmer(tt.kmer)
+			expectedCode := EncodeKmer(tt.expected)
+
+			result := NormalizeInt(kmerCode, len(tt.kmer))
+
+			if result != expectedCode {
+				resultKmer := DecodeKmer(result, len(tt.kmer)) // decoded only to make the failure message readable
+				t.Errorf("NormalizeInt(%q) = %q (code %d), want %q (code %d)",
+					tt.kmer, resultKmer, result, tt.expected, expectedCode)
+			}
+		})
+	}
+}
+
+func TestNormalizeIntConsistencyWithString(t *testing.T) {
+	// Check that NormalizeInt gives the same result as Normalize
+	// for every k-mer of size 1 to 4 (to avoid slowing the tests down too much)
+	bases := []byte{'a', 'c', 'g', 't'}
+
+	var testKmers func(current string, maxSize int)
+	testKmers = func(current string, maxSize int) {
+		if len(current) > 0 {
+			// Normalize via the string path, then encode the result for comparison
+			normalizedStr := Normalize(current)
+			normalizedStrCode := EncodeKmer(normalizedStr)
+
+			kmerCode := EncodeKmer(current)
+			normalizedInt := NormalizeInt(kmerCode, len(current))
+
+			if normalizedInt != normalizedStrCode {
+				normalizedIntStr := DecodeKmer(normalizedInt, len(current))
+				t.Errorf("Inconsistency for %q: Normalize=%q (code %d), NormalizeInt=%q (code %d)",
+					current, normalizedStr, normalizedStrCode, normalizedIntStr, normalizedInt)
+			}
+		}
+
+		if len(current) < maxSize {
+			for _, base := range bases {
+				testKmers(current+string(base), maxSize)
+			}
+		}
+	}
+
+	testKmers("", 4) // Test up to k=4 to keep the run time reasonable
+}
+
+func TestCircularRotations(t *testing.T) {
+	// Check that all circular rotations normalize to the same canonical k-mer
+	tests := []struct {
+		kmers []string // k-mers that are rotations of one another
+		canonical string // form every entry of kmers is expected to normalize to
+	}{
+		{[]string{"atg", "tga", "gat"}, "atg"},
+		{[]string{"acgt", "cgta", "gtac", "tacg"}, "acgt"},
+		{[]string{"tgca", "gcat", "catg", "atgc"}, "atgc"},
+	}
+
+	for _, tt := range tests {
+		canonicalCode := EncodeKmer(tt.canonical)
+
+		for _, kmer := range tt.kmers {
+			kmerCode := EncodeKmer(kmer)
+			result := NormalizeInt(kmerCode, len(kmer))
+
+			if result != canonicalCode {
+				resultKmer := DecodeKmer(result, len(kmer))
+				t.Errorf("NormalizeInt(%q) = %q, want %q", kmer, resultKmer, tt.canonical)
+			}
+		}
+	}
+}
+
+func BenchmarkNormalizeIntSmall(b *testing.B) {
+	// Benchmark for k<=6 (uses the lookup table)
+	kmer := "acgtac"
+	kmerCode := EncodeKmer(kmer)
+	kmerSize := len(kmer)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = NormalizeInt(kmerCode, kmerSize)
+	}
+}
+
+func BenchmarkNormalizeIntLarge(b *testing.B) {
+	// Benchmark for k>6 (computed on the fly)
+	kmer := "acgtacgtac"
+	kmerCode := EncodeKmer(kmer)
+	kmerSize := len(kmer)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = NormalizeInt(kmerCode, kmerSize)
+	}
+}
+
+func BenchmarkEncodeKmer(b *testing.B) { // measures EncodeKmer on a fixed 8-letter k-mer
+	kmer := "acgtacgt"
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = EncodeKmer(kmer)
+	}
+}
+
+func TestCanonicalKmerCount(t *testing.T) {
+	// Check CanonicalKmerCount against the exact expected counts for k=1 to 6
+	tests := []struct {
+		k        int
+		expected int
+	}{
+		{1, 4},
+		{2, 10},
+		{3, 24},
+		{4, 70},
+		{5, 208},
+		{6, 700},
+	}
+
+	for _, tt := range tests {
+		t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
+			result := CanonicalKmerCount(tt.k)
+			if result != tt.expected {
+				t.Errorf("CanonicalKmerCount(%d) = %d, want %d", tt.k, result, tt.expected)
+			}
+		})
+	}
+
+	// Verify counts match the number of distinct values in LexicographicNormalizationInt[k]
+	for k := 1; k <= 6; k++ {
+		// Collect the distinct canonical codes stored in the table for this k
+		uniqueCodes := make(map[int]bool)
+		for _, canonicalCode := range LexicographicNormalizationInt[k] {
+			uniqueCodes[canonicalCode] = true
+		}
+
+		expected := len(uniqueCodes)
+		result := CanonicalKmerCount(k)
+
+		if result != expected {
+			t.Errorf("CanonicalKmerCount(%d) = %d, but table has %d unique canonical codes",
+				k, result, expected)
+		}
+	}
+}
+
+func TestNecklaceCountFormula(t *testing.T) {
+	// Verify that necklaceCount(k, 4) (Moreau's formula) reproduces the hardcoded values for k=1 to 6.
+	// Only k=1..6 is covered here; k=7 and k=8 are checked in TestNecklaceCountByBruteForce.
+	tests := []struct {
+		k        int
+		expected int
+	}{
+		{1, 4},
+		{2, 10},
+		{3, 24},
+		{4, 70},
+		{5, 208},
+		{6, 700},
+	}
+
+	for _, tt := range tests {
+		t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
+			result := necklaceCount(tt.k, 4)
+			if result != tt.expected {
+				t.Errorf("necklaceCount(%d, 4) = %d, want %d", tt.k, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestNecklaceCountByBruteForce(t *testing.T) {
+	// Verify necklaceCount(k, 4) for k=7 and k=8 by brute force:
+	// generate all 4^k k-mer codes and count the distinct NormalizeInt results
+	bases := []byte{'a', 'c', 'g', 't'}
+
+	for _, k := range []int{7, 8} {
+		t.Run(fmt.Sprintf("k=%d", k), func(t *testing.T) {
+			unique := make(map[int]bool)
+
+			// Recursively build every k-mer code, one base per recursion level
+			var generate func(current int, depth int)
+			generate = func(current int, depth int) {
+				if depth == k {
+					// Normalize and add to set
+					normalized := NormalizeInt(current, k)
+					unique[normalized] = true
+					return
+				}
+
+				for _, base := range bases {
+					newCode := (current << 2) | int(EncodeNucleotide(base)) // append the base code in the two low bits
+					generate(newCode, depth+1)
+				}
+			}
+
+			generate(0, 0)
+
+			bruteForceCount := len(unique)
+			formulaCount := necklaceCount(k, 4)
+
+			if bruteForceCount != formulaCount {
+				t.Errorf("For k=%d: brute force count = %d, formula count = %d",
+					k, bruteForceCount, formulaCount)
+			}
+
+			t.Logf("k=%d: unique canonical k-mers = %d (formula matches brute force)", k, bruteForceCount) // NOTE(review): also logged after the Errorf above, where the "matches" wording is wrong
+		})
+	}
+}
+
+func TestEulerTotient(t *testing.T) { // table-driven check of eulerTotient(n) against the expected values below
+	tests := []struct {
+		n        int
+		expected int
+	}{
+		{1, 1},
+		{2, 1},
+		{3, 2},
+		{4, 2},
+		{5, 4},
+		{6, 2},
+		{7, 6},
+		{8, 4},
+		{9, 6},
+		{10, 4},
+		{12, 4},
+		{15, 8},
+		{20, 8},
+	}
+
+	for _, tt := range tests {
+		t.Run(fmt.Sprintf("φ(%d)", tt.n), func(t *testing.T) {
+			result := eulerTotient(tt.n)
+			if result != tt.expected {
+				t.Errorf("eulerTotient(%d) = %d, want %d", tt.n, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestDivisors(t *testing.T) { // table-driven check of divisors(n); the comparison below is order-sensitive
+	tests := []struct {
+		n        int
+		expected []int
+	}{
+		{1, []int{1}},
+		{2, []int{1, 2}},
+		{6, []int{1, 2, 3, 6}},
+		{12, []int{1, 2, 3, 4, 6, 12}},
+		{15, []int{1, 3, 5, 15}},
+		{20, []int{1, 2, 4, 5, 10, 20}},
+	}
+
+	for _, tt := range tests {
+		t.Run(fmt.Sprintf("divisors(%d)", tt.n), func(t *testing.T) {
+			result := divisors(tt.n)
+			if len(result) != len(tt.expected) { // length check first so the index loop below stays in range
+				t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
+				return
+			}
+			for i := range result {
+				if result[i] != tt.expected[i] {
+					t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
+					return
+				}
+			}
+		})
+	}
+}