diff --git a/pkg/obikmer/entropy.go b/pkg/obikmer/entropy.go index 94f8ab7..8281460 100644 --- a/pkg/obikmer/entropy.go +++ b/pkg/obikmer/entropy.go @@ -4,22 +4,21 @@ import "math" // KmerEntropy computes the entropy of a single encoded k-mer. // -// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer +// The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer // to a DNA sequence, extracts all sub-words of each size from 1 to levelMax, // normalizes them by circular canonical form, counts their frequencies, and -// computes Shannon entropy normalized by the maximum possible entropy. +// computes Shannon entropy corrected for class sizes, normalized by the +// maximum possible entropy over 4^ws raw bins. // The returned value is the minimum entropy across all word sizes. // +// Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N +// under-estimates the true complexity when many raw words collapse to the same +// canonical form. Adding Σ f·log(class_size)/N recovers the entropy of the +// underlying uncollapsed distribution (assuming uniform mixing within each +// equivalence class). +// // A value close to 0 indicates very low complexity (e.g. "AAAA..."), // while a value close to 1 indicates high complexity. -// -// Parameters: -// - kmer: the encoded k-mer (2 bits per base) -// - k: the k-mer size -// - levelMax: maximum sub-word size for entropy (typically 6) -// -// Returns: -// - minimum normalized entropy across all word sizes 1..levelMax func KmerEntropy(kmer uint64, k int, levelMax int) float64 { if k < 1 || levelMax < 1 { return 1.0 @@ -35,7 +34,7 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 { var seqBuf [32]byte seq := DecodeKmer(kmer, k, seqBuf[:]) - // Pre-compute nLogN lookup (same as lowmask) + // Pre-compute nLogN lookup nLogN := make([]float64, k+1) for i := 1; i <= k; i++ { nLogN[i] = float64(i) * math.Log(float64(i)) @@ -51,6 +50,23 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 { } } + // Build ln(class_size) tables: for each canonical form, how many raw + // words map to it under circular normalization. + classLogSizeTables := make([][]float64, levelMax+1) + for ws := 1; ws <= levelMax; ws++ { + tableSize := 1 << (ws * 2) + classSize := make([]int, tableSize) + for code := 0; code < tableSize; code++ { + classSize[normTables[ws][code]]++ + } + classLogSizeTables[ws] = make([]float64, tableSize) + for j := 0; j < tableSize; j++ { + if classSize[j] > 0 { + classLogSizeTables[ws][j] = math.Log(float64(classSize[j])) + } + } + } + minEntropy := math.MaxFloat64 for ws := 1; ws <= levelMax; ws++ { @@ -75,23 +91,13 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 { table[normWord]++ } - // Compute Shannon entropy + // Compute emax over 4^ws raw bins (uncollapsed distribution). floatNwords := float64(nwords) logNwords := math.Log(floatNwords) - - var sumNLogN float64 - for j := 0; j < tableSize; j++ { - n := table[j] - if n > 0 { - sumNLogN += nLogN[n] - } - } - - // Compute emax (maximum possible entropy for this word size) - na := CanonicalCircularKmerCount(ws) + na := tableSize // 4^ws var emax float64 if nwords < na { - emax = math.Log(float64(nwords)) + emax = logNwords } else { cov := nwords / na remains := nwords - (na * cov) @@ -105,7 +111,19 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 { continue } - entropy := (logNwords - sumNLogN/floatNwords) / emax + // Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms. + classLogSize := classLogSizeTables[ws] + var sumNLogN, sumClassLogN float64 + for j := 0; j < tableSize; j++ { + n := table[j] + if n > 0 { + sumNLogN += nLogN[n] + sumClassLogN += float64(n) * classLogSize[j] + } + } + + // Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N + entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax if entropy < 0 { entropy = 0 } @@ -129,24 +147,20 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 { // IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use. // Each goroutine must create its own instance via NewKmerEntropyFilter. type KmerEntropyFilter struct { - k int - levelMax int - threshold float64 - nLogN []float64 - normTables [][]int - emaxValues []float64 - logNwords []float64 + k int + levelMax int + threshold float64 + nLogN []float64 + normTables [][]int + classLogSizeTables [][]float64 + emaxValues []float64 + logNwords []float64 // Pre-allocated frequency tables reused across Entropy() calls. // One per word size (index 0 unused). Reset to zero before each use. freqTables [][]int } // NewKmerEntropyFilter creates an entropy filter with pre-computed tables. -// -// Parameters: -// - k: the k-mer size -// - levelMax: maximum sub-word size for entropy (typically 6) -// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected) func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter { if levelMax >= k { levelMax = k - 1 @@ -169,20 +183,38 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter } } + // ln(class_size) for each canonical form under circular normalization. + classLogSizeTables := make([][]float64, levelMax+1) + for ws := 1; ws <= levelMax; ws++ { + tableSize := 1 << (ws * 2) + classSize := make([]int, tableSize) + for code := 0; code < tableSize; code++ { + classSize[normTables[ws][code]]++ + } + classLogSizeTables[ws] = make([]float64, tableSize) + for j := 0; j < tableSize; j++ { + if classSize[j] > 0 { + classLogSizeTables[ws][j] = math.Log(float64(classSize[j])) + } + } + } + + // Pre-compute emax and logNwords per word size. + // emax uses 4^ws raw bins to match the corrected entropy. emaxValues := make([]float64, levelMax+1) logNwords := make([]float64, levelMax+1) for ws := 1; ws <= levelMax; ws++ { nw := k - ws + 1 - na := CanonicalCircularKmerCount(ws) + na := 1 << (ws * 2) // 4^ws raw bins + floatNw := float64(nw) + logNwords[ws] = math.Log(floatNw) if nw < na { - logNwords[ws] = math.Log(float64(nw)) - emaxValues[ws] = math.Log(float64(nw)) + emaxValues[ws] = logNwords[ws] } else { cov := nw / na remains := nw - (na * cov) - f1 := float64(cov) / float64(nw) - f2 := float64(cov+1) / float64(nw) - logNwords[ws] = math.Log(float64(nw)) + f1 := float64(cov) / floatNw + f2 := float64(cov+1) / floatNw emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) + float64(remains)*f2*math.Log(f2)) } @@ -195,14 +227,15 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter } return &KmerEntropyFilter{ - k: k, - levelMax: levelMax, - threshold: threshold, - nLogN: nLogN, - normTables: normTables, - emaxValues: emaxValues, - logNwords: logNwords, - freqTables: freqTables, + k: k, + levelMax: levelMax, + threshold: threshold, + nLogN: nLogN, + normTables: normTables, + classLogSizeTables: classLogSizeTables, + emaxValues: emaxValues, + logNwords: logNwords, + freqTables: freqTables, } } @@ -236,7 +269,7 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 { // Count circular-canonical sub-word frequencies tableSize := 1 << (ws * 2) table := ef.freqTables[ws] - clear(table) // reset to zero + clear(table) mask := (1 << (ws * 2)) - 1 normTable := ef.normTables[ws] @@ -251,19 +284,21 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 { table[normWord]++ } - // Compute Shannon entropy floatNwords := float64(nwords) logNwords := ef.logNwords[ws] + classLogSize := ef.classLogSizeTables[ws] - var sumNLogN float64 + var sumNLogN, sumClassLogN float64 for j := 0; j < tableSize; j++ { n := table[j] if n > 0 { sumNLogN += ef.nLogN[n] + sumClassLogN += float64(n) * classLogSize[j] } } - entropy := (logNwords - sumNLogN/floatNwords) / emax + // Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N + entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax if entropy < 0 { entropy = 0 }