mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
refactoring of obikmer
This commit is contained in:
@@ -619,6 +619,8 @@ func ReverseComplement(kmer uint64, k int) uint64 {
|
||||
// reverse complement. This canonical form ensures that a k-mer and its
|
||||
// reverse complement map to the same value.
|
||||
//
|
||||
// This implements REVERSE COMPLEMENT normalization (biological canonicalization).
|
||||
//
|
||||
// Parameters:
|
||||
// - kmer: the encoded k-mer
|
||||
// - k: the k-mer size (number of nucleotides)
|
||||
@@ -633,6 +635,198 @@ func NormalizeKmer(kmer uint64, k int) uint64 {
|
||||
return kmer
|
||||
}
|
||||
|
||||
// NormalizeCircular returns the lexicographically smallest circular rotation
// of an encoded k-mer. It is used for entropy calculations in low-complexity
// masking.
//
// This implements CIRCULAR PERMUTATION normalization (rotation-based
// canonicalization). Example: ACGT → min(ACGT, CGTA, GTAC, TACG).
//
// This is DIFFERENT from NormalizeKmer, which canonicalizes by reverse
// complement.
//
// Parameters:
//   - kmer: the encoded k-mer (2 bits per nucleotide)
//   - k: the k-mer size in nucleotides
//
// Returns:
//   - the smallest of the k circular rotations; kmer unchanged when k is
//     outside [1, 31]
//
// Time complexity: O(k) — every rotation is examined once.
func NormalizeCircular(kmer uint64, k int) uint64 {
	if k < 1 || k > 31 {
		return kmer
	}

	topShift := uint(2 * (k - 1))
	mask := (uint64(1) << (2 * k)) - 1

	best := kmer
	rot := kmer

	// Walk through all k left-rotations, keeping the smallest one seen.
	for i := 0; i < k; i++ {
		// Left-rotate by one nucleotide: the top 2 bits wrap to the bottom.
		rot = ((rot << 2) | ((rot >> topShift) & 3)) & mask
		if rot < best {
			best = rot
		}
	}

	return best
}
|
||||
|
||||
// EncodeCircularNormalizedKmer encodes a k-mer and returns its lexicographically
|
||||
// smallest circular rotation. This is optimized for single k-mer encoding with
|
||||
// circular normalization.
|
||||
//
|
||||
// This implements CIRCULAR PERMUTATION normalization, used for entropy-based
|
||||
// low-complexity masking. This is DIFFERENT from EncodeNormalizedKmer which
|
||||
// uses reverse complement normalization.
|
||||
//
|
||||
// Parameters:
|
||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||
// - k: k-mer size (must be between 1 and 31)
|
||||
//
|
||||
// Returns:
|
||||
// - normalized k-mer as uint64 (smallest circular rotation)
|
||||
// - panics if len(seq) != k or k is invalid
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// canonical := EncodeCircularNormalizedKmer([]byte("ACGT"), 4)
|
||||
func EncodeCircularNormalizedKmer(seq []byte, k int) uint64 {
|
||||
kmer := EncodeKmer(seq, k)
|
||||
return NormalizeCircular(kmer, k)
|
||||
}
|
||||
|
||||
// CanonicalCircularKmerCount returns the number of unique canonical k-mers
|
||||
// under circular permutation normalization for DNA sequences (4-letter alphabet).
|
||||
//
|
||||
// This counts equivalence classes where k-mers are considered the same if one
|
||||
// is a circular rotation of another (e.g., "ACGT", "CGTA", "GTAC", "TACG" are
|
||||
// all equivalent).
|
||||
//
|
||||
// Uses Moreau's necklace-counting formula for exact counts:
|
||||
//
|
||||
// N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
||||
//
|
||||
// where the sum is over all divisors d of n, and φ is Euler's totient function.
|
||||
//
|
||||
// Parameters:
|
||||
// - k: k-mer size
|
||||
//
|
||||
// Returns:
|
||||
// - number of unique canonical k-mers under circular rotation
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// count := CanonicalCircularKmerCount(4) // Returns 70 (not 256)
|
||||
func CanonicalCircularKmerCount(k int) int {
|
||||
// Hardcoded exact counts for k=1 to 6 (optimization)
|
||||
switch k {
|
||||
case 1:
|
||||
return 4
|
||||
case 2:
|
||||
return 10
|
||||
case 3:
|
||||
return 24
|
||||
case 4:
|
||||
return 70
|
||||
case 5:
|
||||
return 208
|
||||
case 6:
|
||||
return 700
|
||||
default:
|
||||
// For k>6, use Moreau's necklace-counting formula
|
||||
return necklaceCount(k, 4)
|
||||
}
|
||||
}
|
||||
|
||||
// eulerTotient computes Euler's totient function φ(n): the number of
// integers in [1, n] that are coprime with n. Returns 0 for n <= 0.
//
// It uses the product formula φ(n) = n * Π (1 - 1/p) over the distinct
// prime factors p of n, applied as phi -= phi / p for each factor.
func eulerTotient(n int) int {
	if n <= 0 {
		return 0
	}

	phi := n
	m := n

	// Trial-divide to find each distinct prime factor of n.
	for p := 2; p*p <= m; p++ {
		if m%p != 0 {
			continue
		}
		// Strip every power of p so it is seen exactly once.
		for m%p == 0 {
			m /= p
		}
		// Fold in the factor (1 - 1/p): phi = phi * (p-1) / p.
		phi -= phi / p
	}

	// Whatever remains above 1 is a single prime factor > sqrt(n).
	if m > 1 {
		phi -= phi / m
	}

	return phi
}
|
||||
|
||||
// divisors returns all divisors of n in ascending order.
// For n <= 0 it returns an empty (non-nil) slice, matching the original
// contract.
func divisors(n int) []int {
	if n <= 0 {
		return []int{}
	}

	// Enumerate divisor pairs (i, n/i) for i up to sqrt(n). The small
	// halves arrive in ascending order; the large halves in descending
	// order.
	small := []int{}
	var large []int
	for i := 1; i*i <= n; i++ {
		if n%i == 0 {
			small = append(small, i)
			if i != n/i {
				large = append(large, n/i)
			}
		}
	}

	// Appending the large halves in reverse yields a fully sorted result
	// in O(sqrt(n)), replacing the previous O(d^2) bubble sort.
	for j := len(large) - 1; j >= 0; j-- {
		small = append(small, large[j])
	}

	return small
}
|
||||
|
||||
// necklaceCount computes the number of distinct necklaces (equivalence classes
|
||||
// under rotation) for sequences of length n over an alphabet of size a.
|
||||
// Uses Moreau's necklace-counting formula:
|
||||
//
|
||||
// N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
||||
//
|
||||
// where the sum is over all divisors d of n, and φ is Euler's totient function.
|
||||
func necklaceCount(n, alphabetSize int) int {
|
||||
if n <= 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
divs := divisors(n)
|
||||
sum := 0
|
||||
|
||||
for _, d := range divs {
|
||||
// Compute a^(n/d)
|
||||
power := 1
|
||||
exp := n / d
|
||||
for i := 0; i < exp; i++ {
|
||||
power *= alphabetSize
|
||||
}
|
||||
|
||||
sum += eulerTotient(d) * power
|
||||
}
|
||||
|
||||
return sum / n
|
||||
}
|
||||
|
||||
// EncodeNormalizedKmersWithErrors converts a DNA sequence to a slice of normalized k-mers
|
||||
// with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V).
|
||||
//
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,77 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestNormalize(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
kmer string
|
||||
expected string
|
||||
}{
|
||||
// Test avec k=1
|
||||
{"k=1 a", "a", "a"},
|
||||
{"k=1 c", "c", "c"},
|
||||
|
||||
// Test avec k=2
|
||||
{"k=2 ca", "ca", "ac"},
|
||||
{"k=2 ac", "ac", "ac"},
|
||||
|
||||
// Test avec k=4
|
||||
{"k=4 acgt", "acgt", "acgt"},
|
||||
{"k=4 cgta", "cgta", "acgt"},
|
||||
{"k=4 gtac", "gtac", "acgt"},
|
||||
{"k=4 tacg", "tacg", "acgt"},
|
||||
{"k=4 tgca", "tgca", "atgc"},
|
||||
|
||||
// Test avec k=6
|
||||
{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
|
||||
{"k=6 tttttt", "tttttt", "tttttt"},
|
||||
|
||||
// Test avec k>6 (calcul à la volée)
|
||||
{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
|
||||
{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
|
||||
{"k=7 gcatgct", "gcatgct", "atgctgc"},
|
||||
{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
|
||||
{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
|
||||
{"k=10 acgtacgtac", "acgtacgtac", "acacgtacgt"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
result := Normalize(tt.kmer)
|
||||
if result != tt.expected {
|
||||
t.Errorf("Normalize(%q) = %q, want %q", tt.kmer, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeTableConsistency(t *testing.T) {
|
||||
// Vérifier que tous les kmers de la table donnent le bon résultat
|
||||
// en comparant avec le calcul à la volée
|
||||
for kmer, expected := range LexicographicNormalization {
|
||||
calculated := getCanonicalCircular(kmer)
|
||||
if calculated != expected {
|
||||
t.Errorf("Table inconsistency for %q: table=%q, calculated=%q",
|
||||
kmer, expected, calculated)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNormalizeSmall(b *testing.B) {
|
||||
// Benchmark pour k<=6 (utilise la table)
|
||||
kmer := "acgtac"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = Normalize(kmer)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNormalizeLarge(b *testing.B) {
|
||||
// Benchmark pour k>6 (calcul à la volée)
|
||||
kmer := "acgtacgtac"
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = Normalize(kmer)
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,357 +0,0 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestEncodeDecodeKmer(t *testing.T) {
|
||||
tests := []struct {
|
||||
kmer string
|
||||
code int
|
||||
}{
|
||||
{"a", 0},
|
||||
{"c", 1},
|
||||
{"g", 2},
|
||||
{"t", 3},
|
||||
{"aa", 0},
|
||||
{"ac", 1},
|
||||
{"ca", 4},
|
||||
{"acgt", 27}, // 0b00011011
|
||||
{"cgta", 108}, // 0b01101100
|
||||
{"tttt", 255}, // 0b11111111
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.kmer, func(t *testing.T) {
|
||||
// Test encoding
|
||||
encoded := EncodeKmer(tt.kmer)
|
||||
if encoded != tt.code {
|
||||
t.Errorf("EncodeKmer(%q) = %d, want %d", tt.kmer, encoded, tt.code)
|
||||
}
|
||||
|
||||
// Test decoding
|
||||
decoded := DecodeKmer(tt.code, len(tt.kmer))
|
||||
if decoded != tt.kmer {
|
||||
t.Errorf("DecodeKmer(%d, %d) = %q, want %q", tt.code, len(tt.kmer), decoded, tt.kmer)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeInt(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
kmer string
|
||||
expected string
|
||||
}{
|
||||
// Test avec k=1
|
||||
{"k=1 a", "a", "a"},
|
||||
{"k=1 c", "c", "c"},
|
||||
|
||||
// Test avec k=2
|
||||
{"k=2 ca", "ca", "ac"},
|
||||
{"k=2 ac", "ac", "ac"},
|
||||
{"k=2 ta", "ta", "at"},
|
||||
|
||||
// Test avec k=4 - toutes les rotations de "acgt"
|
||||
{"k=4 acgt", "acgt", "acgt"},
|
||||
{"k=4 cgta", "cgta", "acgt"},
|
||||
{"k=4 gtac", "gtac", "acgt"},
|
||||
{"k=4 tacg", "tacg", "acgt"},
|
||||
|
||||
// Test avec k=4 - rotations de "tgca"
|
||||
{"k=4 tgca", "tgca", "atgc"},
|
||||
{"k=4 gcat", "gcat", "atgc"},
|
||||
{"k=4 catg", "catg", "atgc"},
|
||||
{"k=4 atgc", "atgc", "atgc"},
|
||||
|
||||
// Test avec k=3 - rotations de "atg"
|
||||
{"k=3 atg", "atg", "atg"},
|
||||
{"k=3 tga", "tga", "atg"},
|
||||
{"k=3 gat", "gat", "atg"},
|
||||
|
||||
// Test avec k=6
|
||||
{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
|
||||
{"k=6 tttttt", "tttttt", "tttttt"},
|
||||
|
||||
// Test avec k>6 (calcul à la volée)
|
||||
{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
|
||||
{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
|
||||
{"k=7 gcatgct", "gcatgct", "atgctgc"},
|
||||
{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
|
||||
{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
kmerCode := EncodeKmer(tt.kmer)
|
||||
expectedCode := EncodeKmer(tt.expected)
|
||||
|
||||
result := NormalizeInt(kmerCode, len(tt.kmer))
|
||||
|
||||
if result != expectedCode {
|
||||
resultKmer := DecodeKmer(result, len(tt.kmer))
|
||||
t.Errorf("NormalizeInt(%q) = %q (code %d), want %q (code %d)",
|
||||
tt.kmer, resultKmer, result, tt.expected, expectedCode)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalizeIntConsistencyWithString(t *testing.T) {
|
||||
// Vérifier que NormalizeInt donne le même résultat que Normalize
|
||||
// pour tous les k-mers de taille 1 à 4 (pour ne pas trop ralentir les tests)
|
||||
bases := []byte{'a', 'c', 'g', 't'}
|
||||
|
||||
var testKmers func(current string, maxSize int)
|
||||
testKmers = func(current string, maxSize int) {
|
||||
if len(current) > 0 {
|
||||
// Test normalization
|
||||
normalizedStr := Normalize(current)
|
||||
normalizedStrCode := EncodeKmer(normalizedStr)
|
||||
|
||||
kmerCode := EncodeKmer(current)
|
||||
normalizedInt := NormalizeInt(kmerCode, len(current))
|
||||
|
||||
if normalizedInt != normalizedStrCode {
|
||||
normalizedIntStr := DecodeKmer(normalizedInt, len(current))
|
||||
t.Errorf("Inconsistency for %q: Normalize=%q (code %d), NormalizeInt=%q (code %d)",
|
||||
current, normalizedStr, normalizedStrCode, normalizedIntStr, normalizedInt)
|
||||
}
|
||||
}
|
||||
|
||||
if len(current) < maxSize {
|
||||
for _, base := range bases {
|
||||
testKmers(current+string(base), maxSize)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
testKmers("", 4) // Test jusqu'à k=4 pour rester raisonnable
|
||||
}
|
||||
|
||||
func TestCircularRotations(t *testing.T) {
|
||||
// Test que toutes les rotations circulaires donnent le même canonical
|
||||
tests := []struct {
|
||||
kmers []string
|
||||
canonical string
|
||||
}{
|
||||
{[]string{"atg", "tga", "gat"}, "atg"},
|
||||
{[]string{"acgt", "cgta", "gtac", "tacg"}, "acgt"},
|
||||
{[]string{"tgca", "gcat", "catg", "atgc"}, "atgc"},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
canonicalCode := EncodeKmer(tt.canonical)
|
||||
|
||||
for _, kmer := range tt.kmers {
|
||||
kmerCode := EncodeKmer(kmer)
|
||||
result := NormalizeInt(kmerCode, len(kmer))
|
||||
|
||||
if result != canonicalCode {
|
||||
resultKmer := DecodeKmer(result, len(kmer))
|
||||
t.Errorf("NormalizeInt(%q) = %q, want %q", kmer, resultKmer, tt.canonical)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNormalizeIntSmall(b *testing.B) {
|
||||
// Benchmark pour k<=6 (utilise la table)
|
||||
kmer := "acgtac"
|
||||
kmerCode := EncodeKmer(kmer)
|
||||
kmerSize := len(kmer)
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = NormalizeInt(kmerCode, kmerSize)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkNormalizeIntLarge(b *testing.B) {
|
||||
// Benchmark pour k>6 (calcul à la volée)
|
||||
kmer := "acgtacgtac"
|
||||
kmerCode := EncodeKmer(kmer)
|
||||
kmerSize := len(kmer)
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = NormalizeInt(kmerCode, kmerSize)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkEncodeKmer(b *testing.B) {
|
||||
kmer := "acgtacgt"
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
_ = EncodeKmer(kmer)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCanonicalKmerCount(t *testing.T) {
|
||||
// Test exact counts for k=1 to 6
|
||||
tests := []struct {
|
||||
k int
|
||||
expected int
|
||||
}{
|
||||
{1, 4},
|
||||
{2, 10},
|
||||
{3, 24},
|
||||
{4, 70},
|
||||
{5, 208},
|
||||
{6, 700},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
|
||||
result := CanonicalKmerCount(tt.k)
|
||||
if result != tt.expected {
|
||||
t.Errorf("CanonicalKmerCount(%d) = %d, want %d", tt.k, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Verify counts match table sizes
|
||||
for k := 1; k <= 6; k++ {
|
||||
// Count unique canonical codes in the table
|
||||
uniqueCodes := make(map[int]bool)
|
||||
for _, canonicalCode := range LexicographicNormalizationInt[k] {
|
||||
uniqueCodes[canonicalCode] = true
|
||||
}
|
||||
|
||||
expected := len(uniqueCodes)
|
||||
result := CanonicalKmerCount(k)
|
||||
|
||||
if result != expected {
|
||||
t.Errorf("CanonicalKmerCount(%d) = %d, but table has %d unique canonical codes",
|
||||
k, result, expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestNecklaceCountFormula(t *testing.T) {
|
||||
// Verify Moreau's formula gives the same results as hardcoded values for k=1 to 6
|
||||
// and compute exact values for k=7+
|
||||
tests := []struct {
|
||||
k int
|
||||
expected int
|
||||
}{
|
||||
{1, 4},
|
||||
{2, 10},
|
||||
{3, 24},
|
||||
{4, 70},
|
||||
{5, 208},
|
||||
{6, 700},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
|
||||
result := necklaceCount(tt.k, 4)
|
||||
if result != tt.expected {
|
||||
t.Errorf("necklaceCount(%d, 4) = %d, want %d", tt.k, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNecklaceCountByBruteForce(t *testing.T) {
|
||||
// Verify necklace count for k=7 and k=8 by brute force
|
||||
// Generate all 4^k k-mers and count unique normalized ones
|
||||
bases := []byte{'a', 'c', 'g', 't'}
|
||||
|
||||
for _, k := range []int{7, 8} {
|
||||
t.Run(fmt.Sprintf("k=%d", k), func(t *testing.T) {
|
||||
unique := make(map[int]bool)
|
||||
|
||||
// Generate all possible k-mers
|
||||
var generate func(current int, depth int)
|
||||
generate = func(current int, depth int) {
|
||||
if depth == k {
|
||||
// Normalize and add to set
|
||||
normalized := NormalizeInt(current, k)
|
||||
unique[normalized] = true
|
||||
return
|
||||
}
|
||||
|
||||
for _, base := range bases {
|
||||
newCode := (current << 2) | int(EncodeNucleotide(base))
|
||||
generate(newCode, depth+1)
|
||||
}
|
||||
}
|
||||
|
||||
generate(0, 0)
|
||||
|
||||
bruteForceCount := len(unique)
|
||||
formulaCount := necklaceCount(k, 4)
|
||||
|
||||
if bruteForceCount != formulaCount {
|
||||
t.Errorf("For k=%d: brute force count = %d, formula count = %d",
|
||||
k, bruteForceCount, formulaCount)
|
||||
}
|
||||
|
||||
t.Logf("k=%d: unique canonical k-mers = %d (formula matches brute force)", k, bruteForceCount)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestEulerTotient(t *testing.T) {
|
||||
tests := []struct {
|
||||
n int
|
||||
expected int
|
||||
}{
|
||||
{1, 1},
|
||||
{2, 1},
|
||||
{3, 2},
|
||||
{4, 2},
|
||||
{5, 4},
|
||||
{6, 2},
|
||||
{7, 6},
|
||||
{8, 4},
|
||||
{9, 6},
|
||||
{10, 4},
|
||||
{12, 4},
|
||||
{15, 8},
|
||||
{20, 8},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(fmt.Sprintf("φ(%d)", tt.n), func(t *testing.T) {
|
||||
result := eulerTotient(tt.n)
|
||||
if result != tt.expected {
|
||||
t.Errorf("eulerTotient(%d) = %d, want %d", tt.n, result, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDivisors(t *testing.T) {
|
||||
tests := []struct {
|
||||
n int
|
||||
expected []int
|
||||
}{
|
||||
{1, []int{1}},
|
||||
{2, []int{1, 2}},
|
||||
{6, []int{1, 2, 3, 6}},
|
||||
{12, []int{1, 2, 3, 4, 6, 12}},
|
||||
{15, []int{1, 3, 5, 15}},
|
||||
{20, []int{1, 2, 4, 5, 10, 20}},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(fmt.Sprintf("divisors(%d)", tt.n), func(t *testing.T) {
|
||||
result := divisors(tt.n)
|
||||
if len(result) != len(tt.expected) {
|
||||
t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
|
||||
return
|
||||
}
|
||||
for i := range result {
|
||||
if result[i] != tt.expected[i] {
|
||||
t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
|
||||
return
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// commited
|
||||
|
||||
var _Commit = "c5dd477"
|
||||
var _Commit = "6c6c369"
|
||||
var _Version = "Release 4.4.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
||||
@@ -48,12 +48,12 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
// - We calculate the entropy of a distribution where all words appear
|
||||
// cov or cov+1 times (most uniform distribution possible)
|
||||
//
|
||||
// IMPORTANT: Uses CanonicalKmerCount to get the actual number of canonical words
|
||||
// IMPORTANT: Uses CanonicalCircularKmerCount to get the actual number of canonical words
|
||||
// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
|
||||
// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
|
||||
emax := func(lseq, word_size int) float64 {
|
||||
nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq
|
||||
na := obikmer.CanonicalKmerCount(word_size) // Number of canonical words after normalization
|
||||
na := obikmer.CanonicalCircularKmerCount(word_size) // Number of canonical words after normalization
|
||||
|
||||
// Case 1: Fewer positions than possible words
|
||||
// Maximum entropy is simply log(nw) since we can have at most nw different words
|
||||
@@ -215,7 +215,8 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
// *** CIRCULAR NORMALIZATION ***
|
||||
// Convert word to its canonical form (smallest by circular rotation)
|
||||
// This is where "atg", "tga", "gat" all become "atg"
|
||||
words[i] = obikmer.NormalizeInt(word_index, wordSize)
|
||||
// Now using uint64-based NormalizeCircular for better performance
|
||||
words[i] = int(obikmer.NormalizeCircular(uint64(word_index), wordSize))
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
|
||||
Reference in New Issue
Block a user