Mirror of https://github.com/metabarcoding/obitools4.git
Add entropy-based filtering for k-mers
This commit introduces entropy-based filtering for k-mers to remove low-complexity sequences. It adds:

- New KmerEntropy and KmerEntropyFilter functions in pkg/obikmer/entropy.go for computing and filtering k-mer entropy
- Integration of entropy filtering in the k-mer set builder (pkg/obikmer/kmer_set_builder.go)
- A new 'filter' command in the obik tool (pkg/obitools/obik/filter.go) to apply entropy filtering to existing indices
- CLI options for configuring entropy filtering during index building and filtering

The entropy filter improves the quality of k-mer sets by removing repetitive sequences that may interfere with downstream analyses.
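A minimal usage sketch of the new API (hypothetical call site; the module import path, the 0.3 threshold, and the fact that an all-zero code decodes to a homopolymer are assumptions for illustration, not taken from this commit):

package main

import (
	"fmt"

	// Import path assumed from the repository layout.
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
)

func main() {
	// Reusable filter for 31-mers, sub-words up to size 6; k-mers with
	// entropy <= 0.3 are rejected (threshold chosen for illustration).
	ef := obikmer.NewKmerEntropyFilter(31, 6, 0.3)

	// An all-zero code decodes to a homopolymer (whichever base the
	// 2-bit encoding maps to 00), the lowest-complexity case: every
	// sub-word at every level is identical, so the entropy is 0.
	var poly uint64 = 0
	fmt.Println(ef.Entropy(poly)) // 0
	fmt.Println(ef.Accept(poly))  // false: rejected as low complexity

	// One-shot computation without building a reusable filter:
	fmt.Println(obikmer.KmerEntropy(poly, 31, 6)) // also 0
}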
pkg/obikmer/entropy.go (new file, 281 lines)
@@ -0,0 +1,281 @@
package obikmer

import "math"

// KmerEntropy computes the entropy of a single encoded k-mer.
//
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
// normalizes them by circular canonical form, counts their frequencies, and
// computes Shannon entropy normalized by the maximum possible entropy.
// The returned value is the minimum entropy across all word sizes.
//
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
// while a value close to 1 indicates high complexity.
//
// Parameters:
//   - kmer: the encoded k-mer (2 bits per base)
//   - k: the k-mer size
//   - levelMax: maximum sub-word size for entropy (typically 6)
//
// Returns:
//   - minimum normalized entropy across all word sizes 1..levelMax
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
	if k < 1 || levelMax < 1 {
		return 1.0
	}
	if levelMax >= k {
		levelMax = k - 1
	}
	if levelMax < 1 {
		return 1.0
	}

	// Decode k-mer to DNA sequence
	var seqBuf [32]byte
	seq := DecodeKmer(kmer, k, seqBuf[:])

	// Pre-compute nLogN lookup (same as lowmask)
	nLogN := make([]float64, k+1)
	for i := 1; i <= k; i++ {
		nLogN[i] = float64(i) * math.Log(float64(i))
	}

	// Build circular-canonical normalization tables per word size
	normTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		size := 1 << (ws * 2)
		normTables[ws] = make([]int, size)
		for code := 0; code < size; code++ {
			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
		}
	}

	minEntropy := math.MaxFloat64

	for ws := 1; ws <= levelMax; ws++ {
		nwords := k - ws + 1
		if nwords < 1 {
			continue
		}

		// Count circular-canonical sub-word frequencies
		tableSize := 1 << (ws * 2)
		table := make([]int, tableSize)
		mask := (1 << (ws * 2)) - 1

		wordIndex := 0
		for i := 0; i < ws-1; i++ {
			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
		}

		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
			normWord := normTables[ws][wordIndex]
			table[normWord]++
		}

		// Compute Shannon entropy
		floatNwords := float64(nwords)
		logNwords := math.Log(floatNwords)

		var sumNLogN float64
		for j := 0; j < tableSize; j++ {
			n := table[j]
			if n > 0 {
				sumNLogN += nLogN[n]
			}
		}

		// Compute emax (maximum possible entropy for this word size)
		na := CanonicalCircularKmerCount(ws)
		var emax float64
		if nwords < na {
			emax = math.Log(float64(nwords))
		} else {
			cov := nwords / na
			remains := nwords - (na * cov)
			f1 := float64(cov) / floatNwords
			f2 := float64(cov+1) / floatNwords
			emax = -(float64(na-remains)*f1*math.Log(f1) +
				float64(remains)*f2*math.Log(f2))
		}

		if emax <= 0 {
			continue
		}

		entropy := (logNwords - sumNLogN/floatNwords) / emax
		if entropy < 0 {
			entropy = 0
		}

		if entropy < minEntropy {
			minEntropy = entropy
		}
	}

	if minEntropy == math.MaxFloat64 {
		return 1.0
	}

	return math.Round(minEntropy*10000) / 10000
}
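// Worked example (illustrative): for the 10-mer "ACACACACAC", at word
// size 1 the bases A and C each occur 5 times, giving an intermediate
// entropy. At word size 2, every one of the 9 sub-words is either "AC"
// or "CA"; assuming NormalizeCircular maps all rotations of a word to
// one canonical representative (as described above), both collapse to
// a single class, so the normalized entropy at that level is 0. The
// minimum over all levels is therefore 0.0 and the k-mer is classified
// as low complexity.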

// KmerEntropyFilter is a reusable entropy filter for batch processing.
// It pre-computes normalization tables and lookup values to avoid repeated
// allocation across millions of k-mers.
//
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
// Each goroutine must create its own instance via NewKmerEntropyFilter.
type KmerEntropyFilter struct {
	k          int
	levelMax   int
	threshold  float64
	nLogN      []float64
	normTables [][]int
	emaxValues []float64
	logNwords  []float64
	// Pre-allocated frequency tables reused across Entropy() calls.
	// One per word size (index 0 unused). Reset to zero before each use.
	freqTables [][]int
}

// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
//
// Parameters:
//   - k: the k-mer size
//   - levelMax: maximum sub-word size for entropy (typically 6)
//   - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
	if levelMax >= k {
		levelMax = k - 1
	}
	if levelMax < 1 {
		levelMax = 1
	}

	nLogN := make([]float64, k+1)
	for i := 1; i <= k; i++ {
		nLogN[i] = float64(i) * math.Log(float64(i))
	}

	normTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		size := 1 << (ws * 2)
		normTables[ws] = make([]int, size)
		for code := 0; code < size; code++ {
			normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
		}
	}

	emaxValues := make([]float64, levelMax+1)
	logNwords := make([]float64, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		nw := k - ws + 1
		na := CanonicalCircularKmerCount(ws)
		if nw < na {
			logNwords[ws] = math.Log(float64(nw))
			emaxValues[ws] = math.Log(float64(nw))
		} else {
			cov := nw / na
			remains := nw - (na * cov)
			f1 := float64(cov) / float64(nw)
			f2 := float64(cov+1) / float64(nw)
			logNwords[ws] = math.Log(float64(nw))
			emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
				float64(remains)*f2*math.Log(f2))
		}
	}

	// Pre-allocate frequency tables per word size
	freqTables := make([][]int, levelMax+1)
	for ws := 1; ws <= levelMax; ws++ {
		freqTables[ws] = make([]int, 1<<(ws*2))
	}

	return &KmerEntropyFilter{
		k:          k,
		levelMax:   levelMax,
		threshold:  threshold,
		nLogN:      nLogN,
		normTables: normTables,
		emaxValues: emaxValues,
		logNwords:  logNwords,
		freqTables: freqTables,
	}
}
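// Numeric illustration of the emax precomputation (assuming
// CanonicalCircularKmerCount(1) == 4, the four single bases): for
// k = 31 and word size 1, nw = 31, cov = 31/4 = 7, remains = 3, so one
// canonical class receives 7 words and three receive 8, giving
// emax = -(1*(7/31)*ln(7/31) + 3*(8/31)*ln(8/31)) ~ 1.3847, just below
// the uniform bound ln(4) ~ 1.3863.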

// Accept returns true if the k-mer has entropy strictly above the threshold.
// Low-complexity k-mers (entropy <= threshold) are rejected.
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
	return ef.Entropy(kmer) > ef.threshold
}

// Entropy computes the entropy for a single k-mer using pre-computed tables.
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
	k := ef.k

	// Decode k-mer to DNA sequence
	var seqBuf [32]byte
	seq := DecodeKmer(kmer, k, seqBuf[:])

	minEntropy := math.MaxFloat64

	for ws := 1; ws <= ef.levelMax; ws++ {
		nwords := k - ws + 1
		if nwords < 1 {
			continue
		}

		emax := ef.emaxValues[ws]
		if emax <= 0 {
			continue
		}

		// Count circular-canonical sub-word frequencies
		tableSize := 1 << (ws * 2)
		table := ef.freqTables[ws]
		clear(table) // reset to zero
		mask := (1 << (ws * 2)) - 1
		normTable := ef.normTables[ws]

		wordIndex := 0
		for i := 0; i < ws-1; i++ {
			wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
		}

		for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
			wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
			normWord := normTable[wordIndex]
			table[normWord]++
		}

		// Compute Shannon entropy
		floatNwords := float64(nwords)
		logNwords := ef.logNwords[ws]

		var sumNLogN float64
		for j := 0; j < tableSize; j++ {
			n := table[j]
			if n > 0 {
				sumNLogN += ef.nLogN[n]
			}
		}

		entropy := (logNwords - sumNLogN/floatNwords) / emax
		if entropy < 0 {
			entropy = 0
		}

		if entropy < minEntropy {
			minEntropy = entropy
		}
	}

	if minEntropy == math.MaxFloat64 {
		return 1.0
	}

	return math.Round(minEntropy*10000) / 10000
}
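Because a KmerEntropyFilter is not safe for concurrent use, a natural batch-processing pattern is one filter instance per worker goroutine. A minimal sketch (the import path, threshold, and the toy k-mer values are assumptions for illustration; requires Go 1.21+ for min, as the file above already requires for clear):

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"

	// Import path assumed from the repository layout.
	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
)

// countAccepted filters encoded k-mers in parallel, giving each worker
// goroutine its own KmerEntropyFilter, since a single instance must not
// be shared across goroutines.
func countAccepted(kmers []uint64, k int) int64 {
	var accepted int64
	var wg sync.WaitGroup

	nWorkers := runtime.NumCPU()
	chunk := (len(kmers) + nWorkers - 1) / nWorkers

	for w := 0; w < nWorkers; w++ {
		lo := w * chunk
		if lo >= len(kmers) {
			break
		}
		hi := min(lo+chunk, len(kmers))

		wg.Add(1)
		go func(part []uint64) {
			defer wg.Done()
			// Per-goroutine instance: Entropy() mutates the filter's
			// scratch frequency tables.
			ef := obikmer.NewKmerEntropyFilter(k, 6, 0.3)
			var n int64
			for _, km := range part {
				if ef.Accept(km) {
					n++
				}
			}
			atomic.AddInt64(&accepted, n)
		}(kmers[lo:hi])
	}

	wg.Wait()
	return accepted
}

func main() {
	// Toy input: three arbitrary 31-mer codes (values for illustration).
	kmers := []uint64{0, 0x1B1B1B1B1B1B1B, 0x123456789ABCDE}
	fmt.Println(countAccepted(kmers, 31))
}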