mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
refactoring of obikmer
This commit is contained in:
@@ -619,6 +619,8 @@ func ReverseComplement(kmer uint64, k int) uint64 {
|
|||||||
// reverse complement. This canonical form ensures that a k-mer and its
|
// reverse complement. This canonical form ensures that a k-mer and its
|
||||||
// reverse complement map to the same value.
|
// reverse complement map to the same value.
|
||||||
//
|
//
|
||||||
|
// This implements REVERSE COMPLEMENT normalization (biological canonicalization).
|
||||||
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
// - kmer: the encoded k-mer
|
// - kmer: the encoded k-mer
|
||||||
// - k: the k-mer size (number of nucleotides)
|
// - k: the k-mer size (number of nucleotides)
|
||||||
@@ -633,6 +635,198 @@ func NormalizeKmer(kmer uint64, k int) uint64 {
|
|||||||
return kmer
|
return kmer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// NormalizeCircular returns the lexicographically smallest circular rotation
// of a k-mer. It is used for entropy calculations in low-complexity masking.
//
// This implements CIRCULAR PERMUTATION normalization (rotation-based
// canonicalization): ACGT → min(ACGT, CGTA, GTAC, TACG). It is DIFFERENT
// from NormalizeKmer, which canonicalizes by reverse complement.
//
// Parameters:
//   - kmer: the encoded k-mer
//   - k: the k-mer size (number of nucleotides); out of range values
//     (k < 1 or k > 31) return kmer unchanged
//
// Returns:
//   - the lexicographically smallest circular rotation
//
// Time complexity: O(k) - checks all k rotations
func NormalizeCircular(kmer uint64, k int) uint64 {
	if k < 1 || k > 31 {
		return kmer
	}

	width := uint(2 * k)
	mask := (uint64(1) << width) - 1

	best := kmer
	rotated := kmer

	// Walk through every rotation by repeatedly moving the two leading bits
	// of the encoding to the tail, keeping the smallest value seen.
	for step := 0; step < k; step++ {
		lead := (rotated >> (width - 2)) & 3
		rotated = ((rotated << 2) | lead) & mask
		if rotated < best {
			best = rotated
		}
	}

	return best
}
|
||||||
|
|
||||||
|
// EncodeCircularNormalizedKmer encodes a k-mer and returns its lexicographically
|
||||||
|
// smallest circular rotation. This is optimized for single k-mer encoding with
|
||||||
|
// circular normalization.
|
||||||
|
//
|
||||||
|
// This implements CIRCULAR PERMUTATION normalization, used for entropy-based
|
||||||
|
// low-complexity masking. This is DIFFERENT from EncodeNormalizedKmer which
|
||||||
|
// uses reverse complement normalization.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||||
|
// - k: k-mer size (must be between 1 and 31)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - normalized k-mer as uint64 (smallest circular rotation)
|
||||||
|
// - panics if len(seq) != k or k is invalid
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// canonical := EncodeCircularNormalizedKmer([]byte("ACGT"), 4)
|
||||||
|
func EncodeCircularNormalizedKmer(seq []byte, k int) uint64 {
|
||||||
|
kmer := EncodeKmer(seq, k)
|
||||||
|
return NormalizeCircular(kmer, k)
|
||||||
|
}
|
||||||
|
|
||||||
|
// CanonicalCircularKmerCount returns the number of unique canonical k-mers
|
||||||
|
// under circular permutation normalization for DNA sequences (4-letter alphabet).
|
||||||
|
//
|
||||||
|
// This counts equivalence classes where k-mers are considered the same if one
|
||||||
|
// is a circular rotation of another (e.g., "ACGT", "CGTA", "GTAC", "TACG" are
|
||||||
|
// all equivalent).
|
||||||
|
//
|
||||||
|
// Uses Moreau's necklace-counting formula for exact counts:
|
||||||
|
//
|
||||||
|
// N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
||||||
|
//
|
||||||
|
// where the sum is over all divisors d of n, and φ is Euler's totient function.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: k-mer size
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - number of unique canonical k-mers under circular rotation
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// count := CanonicalCircularKmerCount(4) // Returns 70 (not 256)
|
||||||
|
func CanonicalCircularKmerCount(k int) int {
|
||||||
|
// Hardcoded exact counts for k=1 to 6 (optimization)
|
||||||
|
switch k {
|
||||||
|
case 1:
|
||||||
|
return 4
|
||||||
|
case 2:
|
||||||
|
return 10
|
||||||
|
case 3:
|
||||||
|
return 24
|
||||||
|
case 4:
|
||||||
|
return 70
|
||||||
|
case 5:
|
||||||
|
return 208
|
||||||
|
case 6:
|
||||||
|
return 700
|
||||||
|
default:
|
||||||
|
// For k>6, use Moreau's necklace-counting formula
|
||||||
|
return necklaceCount(k, 4)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// eulerTotient computes Euler's totient function φ(n): the number of
// integers in [1, n] that are coprime with n. Returns 0 for n <= 0.
func eulerTotient(n int) int {
	if n <= 0 {
		return 0
	}

	phi := n
	rest := n

	// Strip each prime factor p of n and apply the multiplicative rule
	// φ(n) = n * Π (1 - 1/p) over the distinct primes p dividing n.
	for p := 2; p*p <= rest; p++ {
		if rest%p != 0 {
			continue
		}
		for rest%p == 0 {
			rest /= p
		}
		phi -= phi / p
	}

	// Anything left over is a single prime factor larger than sqrt(n).
	if rest > 1 {
		phi -= phi / rest
	}

	return phi
}
|
||||||
|
|
||||||
|
// divisors returns all divisors of n in ascending order.
// For n <= 0 it returns an empty slice.
func divisors(n int) []int {
	if n <= 0 {
		return []int{}
	}

	// Trial division up to sqrt(n) finds divisors in pairs (i, n/i).
	// The small members arrive in ascending order and the large members in
	// descending order, so concatenating small + reversed(large) yields the
	// result already sorted — replacing the previous O(d^2) bubble sort
	// with an O(d) merge.
	small := []int{}
	large := []int{}
	for i := 1; i*i <= n; i++ {
		if n%i == 0 {
			small = append(small, i)
			if i != n/i { // avoid duplicating the square root
				large = append(large, n/i)
			}
		}
	}

	for i := len(large) - 1; i >= 0; i-- {
		small = append(small, large[i])
	}

	return small
}
|
||||||
|
|
||||||
|
// necklaceCount computes the number of distinct necklaces (equivalence classes
|
||||||
|
// under rotation) for sequences of length n over an alphabet of size a.
|
||||||
|
// Uses Moreau's necklace-counting formula:
|
||||||
|
//
|
||||||
|
// N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
||||||
|
//
|
||||||
|
// where the sum is over all divisors d of n, and φ is Euler's totient function.
|
||||||
|
func necklaceCount(n, alphabetSize int) int {
|
||||||
|
if n <= 0 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
divs := divisors(n)
|
||||||
|
sum := 0
|
||||||
|
|
||||||
|
for _, d := range divs {
|
||||||
|
// Compute a^(n/d)
|
||||||
|
power := 1
|
||||||
|
exp := n / d
|
||||||
|
for i := 0; i < exp; i++ {
|
||||||
|
power *= alphabetSize
|
||||||
|
}
|
||||||
|
|
||||||
|
sum += eulerTotient(d) * power
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum / n
|
||||||
|
}
|
||||||
|
|
||||||
// EncodeNormalizedKmersWithErrors converts a DNA sequence to a slice of normalized k-mers
|
// EncodeNormalizedKmersWithErrors converts a DNA sequence to a slice of normalized k-mers
|
||||||
// with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V).
|
// with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V).
|
||||||
//
|
//
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,77 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import "testing"
|
|
||||||
|
|
||||||
// TestNormalize checks the string-based circular normalization against known
// canonical forms for a range of k values, covering both the table-backed
// path (k<=6) and the on-the-fly computation (k>6).
func TestNormalize(t *testing.T) {
	tests := []struct {
		name     string
		kmer     string
		expected string
	}{
		// k=1 cases
		{"k=1 a", "a", "a"},
		{"k=1 c", "c", "c"},

		// k=2 cases
		{"k=2 ca", "ca", "ac"},
		{"k=2 ac", "ac", "ac"},

		// k=4 cases
		{"k=4 acgt", "acgt", "acgt"},
		{"k=4 cgta", "cgta", "acgt"},
		{"k=4 gtac", "gtac", "acgt"},
		{"k=4 tacg", "tacg", "acgt"},
		{"k=4 tgca", "tgca", "atgc"},

		// k=6 cases
		{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
		{"k=6 tttttt", "tttttt", "tttttt"},

		// k>6 cases (computed on the fly rather than via the lookup table)
		{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
		{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
		{"k=7 gcatgct", "gcatgct", "atgctgc"},
		{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
		{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
		{"k=10 acgtacgtac", "acgtacgtac", "acacgtacgt"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := Normalize(tt.kmer)
			if result != tt.expected {
				t.Errorf("Normalize(%q) = %q, want %q", tt.kmer, result, tt.expected)
			}
		})
	}
}
|
|
||||||
|
|
||||||
// TestNormalizeTableConsistency cross-checks the precomputed normalization
// table against the on-the-fly computation for every entry it contains.
func TestNormalizeTableConsistency(t *testing.T) {
	// Verify that every k-mer in the table yields the expected result
	// by comparing with the on-the-fly computation.
	for kmer, expected := range LexicographicNormalization {
		calculated := getCanonicalCircular(kmer)
		if calculated != expected {
			t.Errorf("Table inconsistency for %q: table=%q, calculated=%q",
				kmer, expected, calculated)
		}
	}
}
|
|
||||||
|
|
||||||
// BenchmarkNormalizeSmall measures string normalization for k<=6
// (the table-backed path).
func BenchmarkNormalizeSmall(b *testing.B) {
	kmer := "acgtac"
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = Normalize(kmer)
	}
}
|
|
||||||
|
|
||||||
// BenchmarkNormalizeLarge measures string normalization for k>6
// (the on-the-fly computation path).
func BenchmarkNormalizeLarge(b *testing.B) {
	kmer := "acgtacgtac"
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = Normalize(kmer)
	}
}
|
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,357 +0,0 @@
|
|||||||
package obikmer
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
"testing"
|
|
||||||
)
|
|
||||||
|
|
||||||
// TestEncodeDecodeKmer verifies that 2-bit encoding and decoding of k-mers
// round-trip correctly for known code values (a=0, c=1, g=2, t=3).
func TestEncodeDecodeKmer(t *testing.T) {
	tests := []struct {
		kmer string
		code int
	}{
		{"a", 0},
		{"c", 1},
		{"g", 2},
		{"t", 3},
		{"aa", 0},
		{"ac", 1},
		{"ca", 4},
		{"acgt", 27},  // 0b00011011
		{"cgta", 108}, // 0b01101100
		{"tttt", 255}, // 0b11111111
	}

	for _, tt := range tests {
		t.Run(tt.kmer, func(t *testing.T) {
			// Encoding must produce the expected integer code.
			encoded := EncodeKmer(tt.kmer)
			if encoded != tt.code {
				t.Errorf("EncodeKmer(%q) = %d, want %d", tt.kmer, encoded, tt.code)
			}

			// Decoding the code must recover the original string.
			decoded := DecodeKmer(tt.code, len(tt.kmer))
			if decoded != tt.kmer {
				t.Errorf("DecodeKmer(%d, %d) = %q, want %q", tt.code, len(tt.kmer), decoded, tt.kmer)
			}
		})
	}
}
|
|
||||||
|
|
||||||
// TestNormalizeInt verifies the integer-based circular normalization against
// known canonical forms, including complete rotation families for k=3 and
// k=4 and the on-the-fly path for k>6.
func TestNormalizeInt(t *testing.T) {
	tests := []struct {
		name     string
		kmer     string
		expected string
	}{
		// k=1 cases
		{"k=1 a", "a", "a"},
		{"k=1 c", "c", "c"},

		// k=2 cases
		{"k=2 ca", "ca", "ac"},
		{"k=2 ac", "ac", "ac"},
		{"k=2 ta", "ta", "at"},

		// k=4 - all rotations of "acgt"
		{"k=4 acgt", "acgt", "acgt"},
		{"k=4 cgta", "cgta", "acgt"},
		{"k=4 gtac", "gtac", "acgt"},
		{"k=4 tacg", "tacg", "acgt"},

		// k=4 - rotations of "tgca"
		{"k=4 tgca", "tgca", "atgc"},
		{"k=4 gcat", "gcat", "atgc"},
		{"k=4 catg", "catg", "atgc"},
		{"k=4 atgc", "atgc", "atgc"},

		// k=3 - rotations of "atg"
		{"k=3 atg", "atg", "atg"},
		{"k=3 tga", "tga", "atg"},
		{"k=3 gat", "gat", "atg"},

		// k=6 cases
		{"k=6 aaaaaa", "aaaaaa", "aaaaaa"},
		{"k=6 tttttt", "tttttt", "tttttt"},

		// k>6 cases (computed on the fly rather than via the lookup table)
		{"k=7 aaaaaaa", "aaaaaaa", "aaaaaaa"},
		{"k=7 tgcatgc", "tgcatgc", "atgctgc"},
		{"k=7 gcatgct", "gcatgct", "atgctgc"},
		{"k=8 acgtacgt", "acgtacgt", "acgtacgt"},
		{"k=8 gtacgtac", "gtacgtac", "acgtacgt"},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			kmerCode := EncodeKmer(tt.kmer)
			expectedCode := EncodeKmer(tt.expected)

			result := NormalizeInt(kmerCode, len(tt.kmer))

			if result != expectedCode {
				// Decode the wrong result so the failure message is readable.
				resultKmer := DecodeKmer(result, len(tt.kmer))
				t.Errorf("NormalizeInt(%q) = %q (code %d), want %q (code %d)",
					tt.kmer, resultKmer, result, tt.expected, expectedCode)
			}
		})
	}
}
|
|
||||||
|
|
||||||
// TestNormalizeIntConsistencyWithString exhaustively checks that NormalizeInt
// agrees with the string-based Normalize for every k-mer of size 1 to 4
// (kept small so the test stays fast).
func TestNormalizeIntConsistencyWithString(t *testing.T) {
	bases := []byte{'a', 'c', 'g', 't'}

	// Recursively enumerate all k-mers up to maxSize and compare the two
	// normalization paths on each non-empty prefix.
	var testKmers func(current string, maxSize int)
	testKmers = func(current string, maxSize int) {
		if len(current) > 0 {
			// Normalize through the string path, then re-encode.
			normalizedStr := Normalize(current)
			normalizedStrCode := EncodeKmer(normalizedStr)

			// Normalize through the integer path.
			kmerCode := EncodeKmer(current)
			normalizedInt := NormalizeInt(kmerCode, len(current))

			if normalizedInt != normalizedStrCode {
				normalizedIntStr := DecodeKmer(normalizedInt, len(current))
				t.Errorf("Inconsistency for %q: Normalize=%q (code %d), NormalizeInt=%q (code %d)",
					current, normalizedStr, normalizedStrCode, normalizedIntStr, normalizedInt)
			}
		}

		if len(current) < maxSize {
			for _, base := range bases {
				testKmers(current+string(base), maxSize)
			}
		}
	}

	testKmers("", 4) // test up to k=4 to keep the runtime reasonable
}
|
|
||||||
|
|
||||||
// TestCircularRotations verifies that every circular rotation of a k-mer
// normalizes to the same canonical form.
func TestCircularRotations(t *testing.T) {
	tests := []struct {
		kmers     []string
		canonical string
	}{
		{[]string{"atg", "tga", "gat"}, "atg"},
		{[]string{"acgt", "cgta", "gtac", "tacg"}, "acgt"},
		{[]string{"tgca", "gcat", "catg", "atgc"}, "atgc"},
	}

	for _, tt := range tests {
		canonicalCode := EncodeKmer(tt.canonical)

		for _, kmer := range tt.kmers {
			kmerCode := EncodeKmer(kmer)
			result := NormalizeInt(kmerCode, len(kmer))

			if result != canonicalCode {
				resultKmer := DecodeKmer(result, len(kmer))
				t.Errorf("NormalizeInt(%q) = %q, want %q", kmer, resultKmer, tt.canonical)
			}
		}
	}
}
|
|
||||||
|
|
||||||
// BenchmarkNormalizeIntSmall measures integer normalization for k<=6
// (the table-backed path). Encoding happens outside the timed loop.
func BenchmarkNormalizeIntSmall(b *testing.B) {
	kmer := "acgtac"
	kmerCode := EncodeKmer(kmer)
	kmerSize := len(kmer)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = NormalizeInt(kmerCode, kmerSize)
	}
}
|
|
||||||
|
|
||||||
// BenchmarkNormalizeIntLarge measures integer normalization for k>6
// (the on-the-fly computation path). Encoding happens outside the timed loop.
func BenchmarkNormalizeIntLarge(b *testing.B) {
	kmer := "acgtacgtac"
	kmerCode := EncodeKmer(kmer)
	kmerSize := len(kmer)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = NormalizeInt(kmerCode, kmerSize)
	}
}
|
|
||||||
|
|
||||||
// BenchmarkEncodeKmer measures raw k-mer encoding throughput for k=8.
func BenchmarkEncodeKmer(b *testing.B) {
	kmer := "acgtacgt"

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_ = EncodeKmer(kmer)
	}
}
|
|
||||||
|
|
||||||
// TestCanonicalKmerCount checks the exact canonical-class counts for k=1..6
// and cross-checks each count against the number of distinct canonical codes
// actually present in the normalization lookup table.
func TestCanonicalKmerCount(t *testing.T) {
	// Expected exact counts for k=1 to 6.
	tests := []struct {
		k        int
		expected int
	}{
		{1, 4},
		{2, 10},
		{3, 24},
		{4, 70},
		{5, 208},
		{6, 700},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
			result := CanonicalKmerCount(tt.k)
			if result != tt.expected {
				t.Errorf("CanonicalKmerCount(%d) = %d, want %d", tt.k, result, tt.expected)
			}
		})
	}

	// Verify counts match table sizes.
	for k := 1; k <= 6; k++ {
		// Count unique canonical codes in the table.
		uniqueCodes := make(map[int]bool)
		for _, canonicalCode := range LexicographicNormalizationInt[k] {
			uniqueCodes[canonicalCode] = true
		}

		expected := len(uniqueCodes)
		result := CanonicalKmerCount(k)

		if result != expected {
			t.Errorf("CanonicalKmerCount(%d) = %d, but table has %d unique canonical codes",
				k, result, expected)
		}
	}
}
|
|
||||||
|
|
||||||
// TestNecklaceCountFormula verifies that Moreau's necklace-counting formula
// reproduces the hardcoded reference values for k=1 to 6 on a 4-letter
// alphabet.
func TestNecklaceCountFormula(t *testing.T) {
	tests := []struct {
		k        int
		expected int
	}{
		{1, 4},
		{2, 10},
		{3, 24},
		{4, 70},
		{5, 208},
		{6, 700},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("k=%d", tt.k), func(t *testing.T) {
			result := necklaceCount(tt.k, 4)
			if result != tt.expected {
				t.Errorf("necklaceCount(%d, 4) = %d, want %d", tt.k, result, tt.expected)
			}
		})
	}
}
|
|
||||||
|
|
||||||
// TestNecklaceCountByBruteForce verifies the necklace count for k=7 and k=8
// by generating all 4^k k-mers and counting the unique normalized forms,
// then comparing against Moreau's formula.
func TestNecklaceCountByBruteForce(t *testing.T) {
	bases := []byte{'a', 'c', 'g', 't'}

	for _, k := range []int{7, 8} {
		t.Run(fmt.Sprintf("k=%d", k), func(t *testing.T) {
			unique := make(map[int]bool)

			// Recursively generate all possible k-mers of length k.
			var generate func(current int, depth int)
			generate = func(current int, depth int) {
				if depth == k {
					// Normalize and record the canonical form.
					normalized := NormalizeInt(current, k)
					unique[normalized] = true
					return
				}

				for _, base := range bases {
					newCode := (current << 2) | int(EncodeNucleotide(base))
					generate(newCode, depth+1)
				}
			}

			generate(0, 0)

			bruteForceCount := len(unique)
			formulaCount := necklaceCount(k, 4)

			if bruteForceCount != formulaCount {
				t.Errorf("For k=%d: brute force count = %d, formula count = %d",
					k, bruteForceCount, formulaCount)
			}

			t.Logf("k=%d: unique canonical k-mers = %d (formula matches brute force)", k, bruteForceCount)
		})
	}
}
|
|
||||||
|
|
||||||
// TestEulerTotient checks Euler's totient function against known values.
func TestEulerTotient(t *testing.T) {
	tests := []struct {
		n        int
		expected int
	}{
		{1, 1},
		{2, 1},
		{3, 2},
		{4, 2},
		{5, 4},
		{6, 2},
		{7, 6},
		{8, 4},
		{9, 6},
		{10, 4},
		{12, 4},
		{15, 8},
		{20, 8},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("φ(%d)", tt.n), func(t *testing.T) {
			result := eulerTotient(tt.n)
			if result != tt.expected {
				t.Errorf("eulerTotient(%d) = %d, want %d", tt.n, result, tt.expected)
			}
		})
	}
}
|
|
||||||
|
|
||||||
// TestDivisors checks that divisors returns the complete, ascending divisor
// list for a set of known inputs.
func TestDivisors(t *testing.T) {
	tests := []struct {
		n        int
		expected []int
	}{
		{1, []int{1}},
		{2, []int{1, 2}},
		{6, []int{1, 2, 3, 6}},
		{12, []int{1, 2, 3, 4, 6, 12}},
		{15, []int{1, 3, 5, 15}},
		{20, []int{1, 2, 4, 5, 10, 20}},
	}

	for _, tt := range tests {
		t.Run(fmt.Sprintf("divisors(%d)", tt.n), func(t *testing.T) {
			result := divisors(tt.n)
			if len(result) != len(tt.expected) {
				t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
				return
			}
			// Element-wise comparison in order.
			for i := range result {
				if result[i] != tt.expected[i] {
					t.Errorf("divisors(%d) = %v, want %v", tt.n, result, tt.expected)
					return
				}
			}
		})
	}
}
|
|
||||||
@@ -8,7 +8,7 @@ import (
|
|||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// commited
|
// commited
|
||||||
|
|
||||||
var _Commit = "c5dd477"
|
var _Commit = "6c6c369"
|
||||||
var _Version = "Release 4.4.0"
|
var _Version = "Release 4.4.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
|||||||
@@ -48,12 +48,12 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
// - We calculate the entropy of a distribution where all words appear
|
// - We calculate the entropy of a distribution where all words appear
|
||||||
// cov or cov+1 times (most uniform distribution possible)
|
// cov or cov+1 times (most uniform distribution possible)
|
||||||
//
|
//
|
||||||
// IMPORTANT: Uses CanonicalKmerCount to get the actual number of canonical words
|
// IMPORTANT: Uses CanonicalCircularKmerCount to get the actual number of canonical words
|
||||||
// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
|
// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
|
||||||
// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
|
// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
|
||||||
emax := func(lseq, word_size int) float64 {
|
emax := func(lseq, word_size int) float64 {
|
||||||
nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq
|
nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq
|
||||||
na := obikmer.CanonicalKmerCount(word_size) // Number of canonical words after normalization
|
na := obikmer.CanonicalCircularKmerCount(word_size) // Number of canonical words after normalization
|
||||||
|
|
||||||
// Case 1: Fewer positions than possible words
|
// Case 1: Fewer positions than possible words
|
||||||
// Maximum entropy is simply log(nw) since we can have at most nw different words
|
// Maximum entropy is simply log(nw) since we can have at most nw different words
|
||||||
@@ -215,7 +215,8 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
// *** CIRCULAR NORMALIZATION ***
|
// *** CIRCULAR NORMALIZATION ***
|
||||||
// Convert word to its canonical form (smallest by circular rotation)
|
// Convert word to its canonical form (smallest by circular rotation)
|
||||||
// This is where "atg", "tga", "gat" all become "atg"
|
// This is where "atg", "tga", "gat" all become "atg"
|
||||||
words[i] = obikmer.NormalizeInt(word_index, wordSize)
|
// Now using uint64-based NormalizeCircular for better performance
|
||||||
|
words[i] = int(obikmer.NormalizeCircular(uint64(word_index), wordSize))
|
||||||
}
|
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
|
|||||||
Reference in New Issue
Block a user