package obikmer import "math" // KmerEntropy computes the entropy of a single encoded k-mer. // // The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer // to a DNA sequence, extracts all sub-words of each size from 1 to levelMax, // normalizes them by circular canonical form, counts their frequencies, and // computes Shannon entropy corrected for class sizes, normalized by the // maximum possible entropy over 4^ws raw bins. // The returned value is the minimum entropy across all word sizes. // // Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N // under-estimates the true complexity when many raw words collapse to the same // canonical form. Adding Σ f·log(class_size)/N recovers the entropy of the // underlying uncollapsed distribution (assuming uniform mixing within each // equivalence class). // // A value close to 0 indicates very low complexity (e.g. "AAAA..."), // while a value close to 1 indicates high complexity. func KmerEntropy(kmer uint64, k int, levelMax int) float64 { if k < 1 || levelMax < 1 { return 1.0 } if levelMax >= k { levelMax = k - 1 } if levelMax < 1 { return 1.0 } // Decode k-mer to DNA sequence var seqBuf [32]byte seq := DecodeKmer(kmer, k, seqBuf[:]) // Pre-compute nLogN lookup nLogN := make([]float64, k+1) for i := 1; i <= k; i++ { nLogN[i] = float64(i) * math.Log(float64(i)) } // Build circular-canonical normalization tables per word size normTables := make([][]int, levelMax+1) for ws := 1; ws <= levelMax; ws++ { size := 1 << (ws * 2) normTables[ws] = make([]int, size) for code := 0; code < size; code++ { normTables[ws][code] = int(NormalizeCircular(uint64(code), ws)) } } // Build ln(class_size) tables: for each canonical form, how many raw // words map to it under circular normalization. classLogSizeTables := make([][]float64, levelMax+1) for ws := 1; ws <= levelMax; ws++ { tableSize := 1 << (ws * 2) classSize := make([]int, tableSize) for code := 0; code < tableSize; code++ { classSize[normTables[ws][code]]++ } classLogSizeTables[ws] = make([]float64, tableSize) for j := 0; j < tableSize; j++ { if classSize[j] > 0 { classLogSizeTables[ws][j] = math.Log(float64(classSize[j])) } } } minEntropy := math.MaxFloat64 for ws := 1; ws <= levelMax; ws++ { nwords := k - ws + 1 if nwords < 1 { continue } // Count circular-canonical sub-word frequencies tableSize := 1 << (ws * 2) table := make([]int, tableSize) mask := (1 << (ws * 2)) - 1 wordIndex := 0 for i := 0; i < ws-1; i++ { wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i])) } for i, j := 0, ws-1; j < k; i, j = i+1, j+1 { wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j])) normWord := normTables[ws][wordIndex] table[normWord]++ } // Compute emax over 4^ws raw bins (uncollapsed distribution). floatNwords := float64(nwords) logNwords := math.Log(floatNwords) na := tableSize // 4^ws var emax float64 if nwords < na { emax = logNwords } else { cov := nwords / na remains := nwords - (na * cov) f1 := float64(cov) / floatNwords f2 := float64(cov+1) / floatNwords emax = -(float64(na-remains)*f1*math.Log(f1) + float64(remains)*f2*math.Log(f2)) } if emax <= 0 { continue } // Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms. classLogSize := classLogSizeTables[ws] var sumNLogN, sumClassLogN float64 for j := 0; j < tableSize; j++ { n := table[j] if n > 0 { sumNLogN += nLogN[n] sumClassLogN += float64(n) * classLogSize[j] } } // Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax if entropy < 0 { entropy = 0 } if entropy < minEntropy { minEntropy = entropy } } if minEntropy == math.MaxFloat64 { return 1.0 } return math.Round(minEntropy*10000) / 10000 } // KmerEntropyFilter is a reusable entropy filter for batch processing. // It pre-computes normalization tables and lookup values to avoid repeated // allocation across millions of k-mers. // // IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use. // Each goroutine must create its own instance via NewKmerEntropyFilter. type KmerEntropyFilter struct { k int levelMax int threshold float64 nLogN []float64 normTables [][]int classLogSizeTables [][]float64 emaxValues []float64 logNwords []float64 // Pre-allocated frequency tables reused across Entropy() calls. // One per word size (index 0 unused). Reset to zero before each use. freqTables [][]int } // NewKmerEntropyFilter creates an entropy filter with pre-computed tables. func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter { if levelMax >= k { levelMax = k - 1 } if levelMax < 1 { levelMax = 1 } nLogN := make([]float64, k+1) for i := 1; i <= k; i++ { nLogN[i] = float64(i) * math.Log(float64(i)) } normTables := make([][]int, levelMax+1) for ws := 1; ws <= levelMax; ws++ { size := 1 << (ws * 2) normTables[ws] = make([]int, size) for code := 0; code < size; code++ { normTables[ws][code] = int(NormalizeCircular(uint64(code), ws)) } } // ln(class_size) for each canonical form under circular normalization. classLogSizeTables := make([][]float64, levelMax+1) for ws := 1; ws <= levelMax; ws++ { tableSize := 1 << (ws * 2) classSize := make([]int, tableSize) for code := 0; code < tableSize; code++ { classSize[normTables[ws][code]]++ } classLogSizeTables[ws] = make([]float64, tableSize) for j := 0; j < tableSize; j++ { if classSize[j] > 0 { classLogSizeTables[ws][j] = math.Log(float64(classSize[j])) } } } // Pre-compute emax and logNwords per word size. // emax uses 4^ws raw bins to match the corrected entropy. emaxValues := make([]float64, levelMax+1) logNwords := make([]float64, levelMax+1) for ws := 1; ws <= levelMax; ws++ { nw := k - ws + 1 na := 1 << (ws * 2) // 4^ws raw bins floatNw := float64(nw) logNwords[ws] = math.Log(floatNw) if nw < na { emaxValues[ws] = logNwords[ws] } else { cov := nw / na remains := nw - (na * cov) f1 := float64(cov) / floatNw f2 := float64(cov+1) / floatNw emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) + float64(remains)*f2*math.Log(f2)) } } // Pre-allocate frequency tables per word size freqTables := make([][]int, levelMax+1) for ws := 1; ws <= levelMax; ws++ { freqTables[ws] = make([]int, 1<<(ws*2)) } return &KmerEntropyFilter{ k: k, levelMax: levelMax, threshold: threshold, nLogN: nLogN, normTables: normTables, classLogSizeTables: classLogSizeTables, emaxValues: emaxValues, logNwords: logNwords, freqTables: freqTables, } } // Accept returns true if the k-mer has entropy strictly above the threshold. // Low-complexity k-mers (entropy <= threshold) are rejected. func (ef *KmerEntropyFilter) Accept(kmer uint64) bool { return ef.Entropy(kmer) > ef.threshold } // Entropy computes the entropy for a single k-mer using pre-computed tables. func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 { k := ef.k // Decode k-mer to DNA sequence var seqBuf [32]byte seq := DecodeKmer(kmer, k, seqBuf[:]) minEntropy := math.MaxFloat64 for ws := 1; ws <= ef.levelMax; ws++ { nwords := k - ws + 1 if nwords < 1 { continue } emax := ef.emaxValues[ws] if emax <= 0 { continue } // Count circular-canonical sub-word frequencies tableSize := 1 << (ws * 2) table := ef.freqTables[ws] clear(table) mask := (1 << (ws * 2)) - 1 normTable := ef.normTables[ws] wordIndex := 0 for i := 0; i < ws-1; i++ { wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i])) } for i, j := 0, ws-1; j < k; i, j = i+1, j+1 { wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j])) normWord := normTable[wordIndex] table[normWord]++ } floatNwords := float64(nwords) logNwords := ef.logNwords[ws] classLogSize := ef.classLogSizeTables[ws] var sumNLogN, sumClassLogN float64 for j := 0; j < tableSize; j++ { n := table[j] if n > 0 { sumNLogN += ef.nLogN[n] sumClassLogN += float64(n) * classLogSize[j] } } // Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax if entropy < 0 { entropy = 0 } if entropy < minEntropy { minEntropy = entropy } } if minEntropy == math.MaxFloat64 { return 1.0 } return math.Round(minEntropy*10000) / 10000 }