mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
This commit introduces entropy-based filtering for k-mers to remove low-complexity sequences. It adds: - New KmerEntropy and KmerEntropyFilter functions in pkg/obikmer/entropy.go for computing and filtering k-mer entropy - Integration of entropy filtering in the k-mer set builder (pkg/obikmer/kmer_set_builder.go) - A new 'filter' command in obik tool (pkg/obitools/obik/filter.go) to apply entropy filtering on existing indices - CLI options for configuring entropy filtering during index building and filtering The entropy filter helps improve the quality of k-mer sets by removing repetitive sequences that may interfere with downstream analyses.
282 lines
7.0 KiB
Go
282 lines
7.0 KiB
Go
package obikmer
|
|
|
|
import "math"
|
|
|
|
// KmerEntropy computes the entropy of a single encoded k-mer.
|
|
//
|
|
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
|
|
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
|
|
// normalizes them by circular canonical form, counts their frequencies, and
|
|
// computes Shannon entropy normalized by the maximum possible entropy.
|
|
// The returned value is the minimum entropy across all word sizes.
|
|
//
|
|
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
|
|
// while a value close to 1 indicates high complexity.
|
|
//
|
|
// Parameters:
|
|
// - kmer: the encoded k-mer (2 bits per base)
|
|
// - k: the k-mer size
|
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
|
//
|
|
// Returns:
|
|
// - minimum normalized entropy across all word sizes 1..levelMax
|
|
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
|
if k < 1 || levelMax < 1 {
|
|
return 1.0
|
|
}
|
|
if levelMax >= k {
|
|
levelMax = k - 1
|
|
}
|
|
if levelMax < 1 {
|
|
return 1.0
|
|
}
|
|
|
|
// Decode k-mer to DNA sequence
|
|
var seqBuf [32]byte
|
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
|
|
|
// Pre-compute nLogN lookup (same as lowmask)
|
|
nLogN := make([]float64, k+1)
|
|
for i := 1; i <= k; i++ {
|
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
|
}
|
|
|
|
// Build circular-canonical normalization tables per word size
|
|
normTables := make([][]int, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
size := 1 << (ws * 2)
|
|
normTables[ws] = make([]int, size)
|
|
for code := 0; code < size; code++ {
|
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
|
}
|
|
}
|
|
|
|
minEntropy := math.MaxFloat64
|
|
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
nwords := k - ws + 1
|
|
if nwords < 1 {
|
|
continue
|
|
}
|
|
|
|
// Count circular-canonical sub-word frequencies
|
|
tableSize := 1 << (ws * 2)
|
|
table := make([]int, tableSize)
|
|
mask := (1 << (ws * 2)) - 1
|
|
|
|
wordIndex := 0
|
|
for i := 0; i < ws-1; i++ {
|
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
|
}
|
|
|
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
|
normWord := normTables[ws][wordIndex]
|
|
table[normWord]++
|
|
}
|
|
|
|
// Compute Shannon entropy
|
|
floatNwords := float64(nwords)
|
|
logNwords := math.Log(floatNwords)
|
|
|
|
var sumNLogN float64
|
|
for j := 0; j < tableSize; j++ {
|
|
n := table[j]
|
|
if n > 0 {
|
|
sumNLogN += nLogN[n]
|
|
}
|
|
}
|
|
|
|
// Compute emax (maximum possible entropy for this word size)
|
|
na := CanonicalCircularKmerCount(ws)
|
|
var emax float64
|
|
if nwords < na {
|
|
emax = math.Log(float64(nwords))
|
|
} else {
|
|
cov := nwords / na
|
|
remains := nwords - (na * cov)
|
|
f1 := float64(cov) / floatNwords
|
|
f2 := float64(cov+1) / floatNwords
|
|
emax = -(float64(na-remains)*f1*math.Log(f1) +
|
|
float64(remains)*f2*math.Log(f2))
|
|
}
|
|
|
|
if emax <= 0 {
|
|
continue
|
|
}
|
|
|
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
|
if entropy < 0 {
|
|
entropy = 0
|
|
}
|
|
|
|
if entropy < minEntropy {
|
|
minEntropy = entropy
|
|
}
|
|
}
|
|
|
|
if minEntropy == math.MaxFloat64 {
|
|
return 1.0
|
|
}
|
|
|
|
return math.Round(minEntropy*10000) / 10000
|
|
}
|
|
|
|
// KmerEntropyFilter is a reusable entropy filter for batch processing.
|
|
// It pre-computes normalization tables and lookup values to avoid repeated
|
|
// allocation across millions of k-mers.
|
|
//
|
|
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
|
|
// Each goroutine must create its own instance via NewKmerEntropyFilter.
|
|
type KmerEntropyFilter struct {
|
|
k int
|
|
levelMax int
|
|
threshold float64
|
|
nLogN []float64
|
|
normTables [][]int
|
|
emaxValues []float64
|
|
logNwords []float64
|
|
// Pre-allocated frequency tables reused across Entropy() calls.
|
|
// One per word size (index 0 unused). Reset to zero before each use.
|
|
freqTables [][]int
|
|
}
|
|
|
|
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
|
|
//
|
|
// Parameters:
|
|
// - k: the k-mer size
|
|
// - levelMax: maximum sub-word size for entropy (typically 6)
|
|
// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
|
|
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
|
|
if levelMax >= k {
|
|
levelMax = k - 1
|
|
}
|
|
if levelMax < 1 {
|
|
levelMax = 1
|
|
}
|
|
|
|
nLogN := make([]float64, k+1)
|
|
for i := 1; i <= k; i++ {
|
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
|
}
|
|
|
|
normTables := make([][]int, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
size := 1 << (ws * 2)
|
|
normTables[ws] = make([]int, size)
|
|
for code := 0; code < size; code++ {
|
|
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
|
|
}
|
|
}
|
|
|
|
emaxValues := make([]float64, levelMax+1)
|
|
logNwords := make([]float64, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
nw := k - ws + 1
|
|
na := CanonicalCircularKmerCount(ws)
|
|
if nw < na {
|
|
logNwords[ws] = math.Log(float64(nw))
|
|
emaxValues[ws] = math.Log(float64(nw))
|
|
} else {
|
|
cov := nw / na
|
|
remains := nw - (na * cov)
|
|
f1 := float64(cov) / float64(nw)
|
|
f2 := float64(cov+1) / float64(nw)
|
|
logNwords[ws] = math.Log(float64(nw))
|
|
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
|
float64(remains)*f2*math.Log(f2))
|
|
}
|
|
}
|
|
|
|
// Pre-allocate frequency tables per word size
|
|
freqTables := make([][]int, levelMax+1)
|
|
for ws := 1; ws <= levelMax; ws++ {
|
|
freqTables[ws] = make([]int, 1<<(ws*2))
|
|
}
|
|
|
|
return &KmerEntropyFilter{
|
|
k: k,
|
|
levelMax: levelMax,
|
|
threshold: threshold,
|
|
nLogN: nLogN,
|
|
normTables: normTables,
|
|
emaxValues: emaxValues,
|
|
logNwords: logNwords,
|
|
freqTables: freqTables,
|
|
}
|
|
}
|
|
|
|
// Accept returns true if the k-mer has entropy strictly above the threshold.
|
|
// Low-complexity k-mers (entropy <= threshold) are rejected.
|
|
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
|
|
return ef.Entropy(kmer) > ef.threshold
|
|
}
|
|
|
|
// Entropy computes the entropy for a single k-mer using pre-computed tables.
|
|
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
|
|
k := ef.k
|
|
|
|
// Decode k-mer to DNA sequence
|
|
var seqBuf [32]byte
|
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
|
|
|
minEntropy := math.MaxFloat64
|
|
|
|
for ws := 1; ws <= ef.levelMax; ws++ {
|
|
nwords := k - ws + 1
|
|
if nwords < 1 {
|
|
continue
|
|
}
|
|
|
|
emax := ef.emaxValues[ws]
|
|
if emax <= 0 {
|
|
continue
|
|
}
|
|
|
|
// Count circular-canonical sub-word frequencies
|
|
tableSize := 1 << (ws * 2)
|
|
table := ef.freqTables[ws]
|
|
clear(table) // reset to zero
|
|
mask := (1 << (ws * 2)) - 1
|
|
normTable := ef.normTables[ws]
|
|
|
|
wordIndex := 0
|
|
for i := 0; i < ws-1; i++ {
|
|
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
|
|
}
|
|
|
|
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
|
|
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
|
|
normWord := normTable[wordIndex]
|
|
table[normWord]++
|
|
}
|
|
|
|
// Compute Shannon entropy
|
|
floatNwords := float64(nwords)
|
|
logNwords := ef.logNwords[ws]
|
|
|
|
var sumNLogN float64
|
|
for j := 0; j < tableSize; j++ {
|
|
n := table[j]
|
|
if n > 0 {
|
|
sumNLogN += ef.nLogN[n]
|
|
}
|
|
}
|
|
|
|
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
|
if entropy < 0 {
|
|
entropy = 0
|
|
}
|
|
|
|
if entropy < minEntropy {
|
|
minEntropy = entropy
|
|
}
|
|
}
|
|
|
|
if minEntropy == math.MaxFloat64 {
|
|
return 1.0
|
|
}
|
|
|
|
return math.Round(minEntropy*10000) / 10000
|
|
}
|