Correct Shannon entropy bias for canonical k-mers

Multiple raw k-mers collapsing into identical circular canonical forms introduce bias into complexity estimates. This change pre-computes `log(class_size)` tables and per-word-size maximum entropy bounds. The `KmerEntropy` function and `KmerEntropyFilter` are updated to apply the corrected formula `(log(N) + Σf·log(s) - Σf·log(f))/N / emax`, ensuring accurate sequence complexity estimation.
2026-06-24 01:31:00 +00:00 · 2026-05-17 14:52:31 +08:00
parent cecf90fa40
commit af7ae3d60c
1 changed files with 90 additions and 55 deletions
@@ -4,22 +4,21 @@ import "math"

 // KmerEntropy computes the entropy of a single encoded k-mer.
 //
-// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
+// The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer
 // to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
 // normalizes them by circular canonical form, counts their frequencies, and
-// computes Shannon entropy normalized by the maximum possible entropy.
+// computes Shannon entropy corrected for class sizes, normalized by the
+// maximum possible entropy over 4^ws raw bins.
 // The returned value is the minimum entropy across all word sizes.
 //
+// Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N
+// under-estimates the true complexity when many raw words collapse to the same
+// canonical form.  Adding Σ f·log(class_size)/N recovers the entropy of the
+// underlying uncollapsed distribution (assuming uniform mixing within each
+// equivalence class).
+//
 // A value close to 0 indicates very low complexity (e.g. "AAAA..."),
 // while a value close to 1 indicates high complexity.
-//
-// Parameters:
-//   - kmer: the encoded k-mer (2 bits per base)
-//   - k: the k-mer size
-//   - levelMax: maximum sub-word size for entropy (typically 6)
-//
-// Returns:
-//   - minimum normalized entropy across all word sizes 1..levelMax
 func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 	if k < 1 || levelMax < 1 {
 		return 1.0
@@ -35,7 +34,7 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 	var seqBuf [32]byte
 	seq := DecodeKmer(kmer, k, seqBuf[:])

-	// Pre-compute nLogN lookup (same as lowmask)
+	// Pre-compute nLogN lookup
 	nLogN := make([]float64, k+1)
 	for i := 1; i <= k; i++ {
 		nLogN[i] = float64(i) * math.Log(float64(i))
@@ -51,6 +50,23 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 		}
 	}

+	// Build ln(class_size) tables: for each canonical form, how many raw
+	// words map to it under circular normalization.
+	classLogSizeTables := make([][]float64, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		tableSize := 1 << (ws * 2)
+		classSize := make([]int, tableSize)
+		for code := 0; code < tableSize; code++ {
+			classSize[normTables[ws][code]]++
+		}
+		classLogSizeTables[ws] = make([]float64, tableSize)
+		for j := 0; j < tableSize; j++ {
+			if classSize[j] > 0 {
+				classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
+			}
+		}
+	}
+
 	minEntropy := math.MaxFloat64

 	for ws := 1; ws <= levelMax; ws++ {
@@ -75,23 +91,13 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 			table[normWord]++
 		}

-		// Compute Shannon entropy
+		// Compute emax over 4^ws raw bins (uncollapsed distribution).
 		floatNwords := float64(nwords)
 		logNwords := math.Log(floatNwords)
-
-		var sumNLogN float64
-		for j := 0; j < tableSize; j++ {
-			n := table[j]
-			if n > 0 {
-				sumNLogN += nLogN[n]
-			}
-		}
-
-		// Compute emax (maximum possible entropy for this word size)
-		na := CanonicalCircularKmerCount(ws)
+		na := tableSize // 4^ws
 		var emax float64
 		if nwords < na {
-			emax = math.Log(float64(nwords))
+			emax = logNwords
 		} else {
 			cov := nwords / na
 			remains := nwords - (na * cov)
@@ -105,7 +111,19 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
 			continue
 		}

-		entropy := (logNwords - sumNLogN/floatNwords) / emax
+		// Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms.
+		classLogSize := classLogSizeTables[ws]
+		var sumNLogN, sumClassLogN float64
+		for j := 0; j < tableSize; j++ {
+			n := table[j]
+			if n > 0 {
+				sumNLogN += nLogN[n]
+				sumClassLogN += float64(n) * classLogSize[j]
+			}
+		}
+
+		// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
+		entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
 		if entropy < 0 {
 			entropy = 0
 		}
@@ -134,6 +152,7 @@ type KmerEntropyFilter struct {
 	threshold           float64
 	nLogN               []float64
 	normTables          [][]int
+	classLogSizeTables  [][]float64
 	emaxValues          []float64
 	logNwords           []float64
 	// Pre-allocated frequency tables reused across Entropy() calls.
@@ -142,11 +161,6 @@ type KmerEntropyFilter struct {
 }

 // NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
-//
-// Parameters:
-//   - k: the k-mer size
-//   - levelMax: maximum sub-word size for entropy (typically 6)
-//   - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
 func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
 	if levelMax >= k {
 		levelMax = k - 1
@@ -169,20 +183,38 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter
 		}
 	}

+	// ln(class_size) for each canonical form under circular normalization.
+	classLogSizeTables := make([][]float64, levelMax+1)
+	for ws := 1; ws <= levelMax; ws++ {
+		tableSize := 1 << (ws * 2)
+		classSize := make([]int, tableSize)
+		for code := 0; code < tableSize; code++ {
+			classSize[normTables[ws][code]]++
+		}
+		classLogSizeTables[ws] = make([]float64, tableSize)
+		for j := 0; j < tableSize; j++ {
+			if classSize[j] > 0 {
+				classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
+			}
+		}
+	}
+
+	// Pre-compute emax and logNwords per word size.
+	// emax uses 4^ws raw bins to match the corrected entropy.
 	emaxValues := make([]float64, levelMax+1)
 	logNwords := make([]float64, levelMax+1)
 	for ws := 1; ws <= levelMax; ws++ {
 		nw := k - ws + 1
-		na := CanonicalCircularKmerCount(ws)
+		na := 1 << (ws * 2) // 4^ws raw bins
+		floatNw := float64(nw)
+		logNwords[ws] = math.Log(floatNw)
 		if nw < na {
-			logNwords[ws] = math.Log(float64(nw))
-			emaxValues[ws] = math.Log(float64(nw))
+			emaxValues[ws] = logNwords[ws]
 		} else {
 			cov := nw / na
 			remains := nw - (na * cov)
-			f1 := float64(cov) / float64(nw)
-			f2 := float64(cov+1) / float64(nw)
-			logNwords[ws] = math.Log(float64(nw))
+			f1 := float64(cov) / floatNw
+			f2 := float64(cov+1) / floatNw
 			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
 				float64(remains)*f2*math.Log(f2))
 		}
@@ -200,6 +232,7 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter
 		threshold:          threshold,
 		nLogN:              nLogN,
 		normTables:         normTables,
+		classLogSizeTables: classLogSizeTables,
 		emaxValues:         emaxValues,
 		logNwords:          logNwords,
 		freqTables:         freqTables,
@@ -236,7 +269,7 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
 		// Count circular-canonical sub-word frequencies
 		tableSize := 1 << (ws * 2)
 		table := ef.freqTables[ws]
-		clear(table) // reset to zero
+		clear(table)
 		mask := (1 << (ws * 2)) - 1
 		normTable := ef.normTables[ws]

@@ -251,19 +284,21 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
 			table[normWord]++
 		}

-		// Compute Shannon entropy
 		floatNwords := float64(nwords)
 		logNwords := ef.logNwords[ws]
+		classLogSize := ef.classLogSizeTables[ws]

-		var sumNLogN float64
+		var sumNLogN, sumClassLogN float64
 		for j := 0; j < tableSize; j++ {
 			n := table[j]
 			if n > 0 {
 				sumNLogN += ef.nLogN[n]
+				sumClassLogN += float64(n) * classLogSize[j]
 			}
 		}

-		entropy := (logNwords - sumNLogN/floatNwords) / emax
+		// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
+		entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
 		if entropy < 0 {
 			entropy = 0
 		}