mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-06-24 01:31:00 +00:00
af7ae3d60c
Multiple raw k-mers collapsing into identical circular canonical forms introduce bias into complexity estimates. This change pre-computes `log(class_size)` tables and per-word-size maximum entropy bounds. The `KmerEntropy` function and `KmerEntropyFilter` are updated to apply the corrected formula `(log(N) + Σf·log(s) - Σf·log(f))/N / emax`, ensuring accurate sequence complexity estimation.
317 lines
8.6 KiB
Go
317 lines
8.6 KiB
Go
package obikmer
|
|
|
|
import "math"
|
|
|
|
// KmerEntropy computes the entropy of a single encoded k-mer.
|
|
//
|
|
// The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer
|
|
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
|
|
// normalizes them by circular canonical form, counts their frequencies, and
|
|
// computes Shannon entropy corrected for class sizes, normalized by the
|
|
// maximum possible entropy over 4^ws raw bins.
|
|
// The returned value is the minimum entropy across all word sizes.
|
|
//
|
|
// Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N
|
|
// under-estimates the true complexity when many raw words collapse to the same
|
|
// canonical form. Adding Σ f·log(class_size)/N recovers the entropy of the
|
|
// underlying uncollapsed distribution (assuming uniform mixing within each
|
|
// equivalence class).
|
|
//
|
|
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
|
|
// while a value close to 1 indicates high complexity.
|
|
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
|
if k < 1 || levelMax < 1 {
|
|
return 1.0
|
|
}
|
|
if levelMax >= k {
|
|
levelMax = k - 1
|
|
}
|
|
if levelMax < 1 {
|
|
return 1.0
|
|
}
|
|
|
|
// Decode k-mer to DNA sequence
|
|
var seqBuf [32]byte
|
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
|
|
|
// Pre-compute nLogN lookup
|
|
nLogN := make([]float64, k+1)
|
|
for i := 1; i <= k; i++ {
|
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
|
}
|
|
|
|
// Build circular-canonical normalization tables per word size
|
|
normTables := make([][]int, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
size := 1 << (ws * 2)
|
|
normTables[ws] = make([]int, size)
|
|
for code := 0; code < size; code++ {
|
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
|
}
|
|
}
|
|
|
|
// Build ln(class_size) tables: for each canonical form, how many raw
|
|
// words map to it under circular normalization.
|
|
classLogSizeTables := make([][]float64, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
tableSize := 1 << (ws * 2)
|
|
classSize := make([]int, tableSize)
|
|
for code := 0; code < tableSize; code++ {
|
|
classSize[normTables[ws][code]]++
|
|
}
|
|
classLogSizeTables[ws] = make([]float64, tableSize)
|
|
for j := 0; j < tableSize; j++ {
|
|
if classSize[j] > 0 {
|
|
classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
|
|
}
|
|
}
|
|
}
|
|
|
|
minEntropy := math.MaxFloat64
|
|
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
nwords := k - ws + 1
|
|
if nwords < 1 {
|
|
continue
|
|
}
|
|
|
|
// Count circular-canonical sub-word frequencies
|
|
tableSize := 1 << (ws * 2)
|
|
table := make([]int, tableSize)
|
|
mask := (1 << (ws * 2)) - 1
|
|
|
|
wordIndex := 0
|
|
for i := 0; i < ws-1; i++ {
|
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
|
}
|
|
|
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
|
normWord := normTables[ws][wordIndex]
|
|
table[normWord]++
|
|
}
|
|
|
|
// Compute emax over 4^ws raw bins (uncollapsed distribution).
|
|
floatNwords := float64(nwords)
|
|
logNwords := math.Log(floatNwords)
|
|
na := tableSize // 4^ws
|
|
var emax float64
|
|
if nwords < na {
|
|
emax = logNwords
|
|
} else {
|
|
cov := nwords / na
|
|
remains := nwords - (na * cov)
|
|
f1 := float64(cov) / floatNwords
|
|
f2 := float64(cov+1) / floatNwords
|
|
emax = -(float64(na-remains)*f1*math.Log(f1) +
|
|
float64(remains)*f2*math.Log(f2))
|
|
}
|
|
|
|
if emax <= 0 {
|
|
continue
|
|
}
|
|
|
|
// Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms.
|
|
classLogSize := classLogSizeTables[ws]
|
|
var sumNLogN, sumClassLogN float64
|
|
for j := 0; j < tableSize; j++ {
|
|
n := table[j]
|
|
if n > 0 {
|
|
sumNLogN += nLogN[n]
|
|
sumClassLogN += float64(n) * classLogSize[j]
|
|
}
|
|
}
|
|
|
|
// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
|
|
entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
|
|
if entropy < 0 {
|
|
entropy = 0
|
|
}
|
|
|
|
if entropy < minEntropy {
|
|
minEntropy = entropy
|
|
}
|
|
}
|
|
|
|
if minEntropy == math.MaxFloat64 {
|
|
return 1.0
|
|
}
|
|
|
|
return math.Round(minEntropy*10000) / 10000
|
|
}
|
|
|
|
// KmerEntropyFilter is a reusable entropy filter for batch processing.
|
|
// It pre-computes normalization tables and lookup values to avoid repeated
|
|
// allocation across millions of k-mers.
|
|
//
|
|
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
|
|
// Each goroutine must create its own instance via NewKmerEntropyFilter.
|
|
type KmerEntropyFilter struct {
|
|
k int
|
|
levelMax int
|
|
threshold float64
|
|
nLogN []float64
|
|
normTables [][]int
|
|
classLogSizeTables [][]float64
|
|
emaxValues []float64
|
|
logNwords []float64
|
|
// Pre-allocated frequency tables reused across Entropy() calls.
|
|
// One per word size (index 0 unused). Reset to zero before each use.
|
|
freqTables [][]int
|
|
}
|
|
|
|
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
|
|
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
|
|
if levelMax >= k {
|
|
levelMax = k - 1
|
|
}
|
|
if levelMax < 1 {
|
|
levelMax = 1
|
|
}
|
|
|
|
nLogN := make([]float64, k+1)
|
|
for i := 1; i <= k; i++ {
|
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
|
}
|
|
|
|
normTables := make([][]int, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
size := 1 << (ws * 2)
|
|
normTables[ws] = make([]int, size)
|
|
for code := 0; code < size; code++ {
|
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
|
}
|
|
}
|
|
|
|
// ln(class_size) for each canonical form under circular normalization.
|
|
classLogSizeTables := make([][]float64, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
tableSize := 1 << (ws * 2)
|
|
classSize := make([]int, tableSize)
|
|
for code := 0; code < tableSize; code++ {
|
|
classSize[normTables[ws][code]]++
|
|
}
|
|
classLogSizeTables[ws] = make([]float64, tableSize)
|
|
for j := 0; j < tableSize; j++ {
|
|
if classSize[j] > 0 {
|
|
classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pre-compute emax and logNwords per word size.
|
|
// emax uses 4^ws raw bins to match the corrected entropy.
|
|
emaxValues := make([]float64, levelMax+1)
|
|
logNwords := make([]float64, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
nw := k - ws + 1
|
|
na := 1 << (ws * 2) // 4^ws raw bins
|
|
floatNw := float64(nw)
|
|
logNwords[ws] = math.Log(floatNw)
|
|
if nw < na {
|
|
emaxValues[ws] = logNwords[ws]
|
|
} else {
|
|
cov := nw / na
|
|
remains := nw - (na * cov)
|
|
f1 := float64(cov) / floatNw
|
|
f2 := float64(cov+1) / floatNw
|
|
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
|
float64(remains)*f2*math.Log(f2))
|
|
}
|
|
}
|
|
|
|
// Pre-allocate frequency tables per word size
|
|
freqTables := make([][]int, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
freqTables[ws] = make([]int, 1<<(ws*2))
|
|
}
|
|
|
|
return &KmerEntropyFilter{
|
|
k: k,
|
|
levelMax: levelMax,
|
|
threshold: threshold,
|
|
nLogN: nLogN,
|
|
normTables: normTables,
|
|
classLogSizeTables: classLogSizeTables,
|
|
emaxValues: emaxValues,
|
|
logNwords: logNwords,
|
|
freqTables: freqTables,
|
|
}
|
|
}
|
|
|
|
// Accept returns true if the k-mer has entropy strictly above the threshold.
|
|
// Low-complexity k-mers (entropy <= threshold) are rejected.
|
|
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
|
|
return ef.Entropy(kmer) > ef.threshold
|
|
}
|
|
|
|
// Entropy computes the entropy for a single k-mer using pre-computed tables.
|
|
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
|
|
k := ef.k
|
|
|
|
// Decode k-mer to DNA sequence
|
|
var seqBuf [32]byte
|
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
|
|
|
minEntropy := math.MaxFloat64
|
|
|
|
for ws := 1; ws <= ef.levelMax; ws++ {
|
|
nwords := k - ws + 1
|
|
if nwords < 1 {
|
|
continue
|
|
}
|
|
|
|
emax := ef.emaxValues[ws]
|
|
if emax <= 0 {
|
|
continue
|
|
}
|
|
|
|
// Count circular-canonical sub-word frequencies
|
|
tableSize := 1 << (ws * 2)
|
|
table := ef.freqTables[ws]
|
|
clear(table)
|
|
mask := (1 << (ws * 2)) - 1
|
|
normTable := ef.normTables[ws]
|
|
|
|
wordIndex := 0
|
|
for i := 0; i < ws-1; i++ {
|
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
|
}
|
|
|
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
|
normWord := normTable[wordIndex]
|
|
table[normWord]++
|
|
}
|
|
|
|
floatNwords := float64(nwords)
|
|
logNwords := ef.logNwords[ws]
|
|
classLogSize := ef.classLogSizeTables[ws]
|
|
|
|
var sumNLogN, sumClassLogN float64
|
|
for j := 0; j < tableSize; j++ {
|
|
n := table[j]
|
|
if n > 0 {
|
|
sumNLogN += ef.nLogN[n]
|
|
sumClassLogN += float64(n) * classLogSize[j]
|
|
}
|
|
}
|
|
|
|
// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
|
|
entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
|
|
if entropy < 0 {
|
|
entropy = 0
|
|
}
|
|
|
|
if entropy < minEntropy {
|
|
minEntropy = entropy
|
|
}
|
|
}
|
|
|
|
if minEntropy == math.MaxFloat64 {
|
|
return 1.0
|
|
}
|
|
|
|
return math.Round(minEntropy*10000) / 10000
|
|
}
|