Add entropy-based filtering for k-mers

This commit introduces entropy-based filtering for k-mers to remove low-complexity sequences. It adds:

- New KmerEntropy and KmerEntropyFilter functions in pkg/obikmer/entropy.go for computing and filtering k-mer entropy
- Integration of entropy filtering in the k-mer set builder (pkg/obikmer/kmer_set_builder.go)
- A new 'filter' command in obik tool (pkg/obitools/obik/filter.go) to apply entropy filtering on existing indices
- CLI options for configuring entropy filtering during index building and filtering

The entropy filter helps improve the quality of k-mer sets by removing repetitive sequences that may interfere with downstream analyses.
This commit is contained in:
Eric Coissac
2026-02-10 18:19:57 +01:00
parent c6e04265f1
commit bebbbbfe7d
7 changed files with 910 additions and 60 deletions

281
pkg/obikmer/entropy.go Normal file
View File

@@ -0,0 +1,281 @@
package obikmer
import "math"
// KmerEntropy computes the entropy of a single encoded k-mer.
//
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
// normalizes them by circular canonical form, counts their frequencies, and
// computes Shannon entropy normalized by the maximum possible entropy.
// The returned value is the minimum entropy across all word sizes.
//
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
// while a value close to 1 indicates high complexity.
//
// Parameters:
// - kmer: the encoded k-mer (2 bits per base)
// - k: the k-mer size
// - levelMax: maximum sub-word size for entropy (typically 6)
//
// Returns:
// - minimum normalized entropy across all word sizes 1..levelMax
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
if k < 1 || levelMax < 1 {
return 1.0
}
if levelMax >= k {
levelMax = k - 1
}
if levelMax < 1 {
return 1.0
}
// Decode k-mer to DNA sequence
var seqBuf [32]byte
seq := DecodeKmer(kmer, k, seqBuf[:])
// Pre-compute nLogN lookup (same as lowmask)
nLogN := make([]float64, k+1)
for i := 1; i <= k; i++ {
nLogN[i] = float64(i) * math.Log(float64(i))
}
// Build circular-canonical normalization tables per word size
normTables := make([][]int, levelMax+1)
for ws := 1; ws <= levelMax; ws++ {
size := 1 << (ws * 2)
normTables[ws] = make([]int, size)
for code := 0; code < size; code++ {
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
}
}
minEntropy := math.MaxFloat64
for ws := 1; ws <= levelMax; ws++ {
nwords := k - ws + 1
if nwords < 1 {
continue
}
// Count circular-canonical sub-word frequencies
tableSize := 1 << (ws * 2)
table := make([]int, tableSize)
mask := (1 << (ws * 2)) - 1
wordIndex := 0
for i := 0; i < ws-1; i++ {
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
}
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
normWord := normTables[ws][wordIndex]
table[normWord]++
}
// Compute Shannon entropy
floatNwords := float64(nwords)
logNwords := math.Log(floatNwords)
var sumNLogN float64
for j := 0; j < tableSize; j++ {
n := table[j]
if n > 0 {
sumNLogN += nLogN[n]
}
}
// Compute emax (maximum possible entropy for this word size)
na := CanonicalCircularKmerCount(ws)
var emax float64
if nwords < na {
emax = math.Log(float64(nwords))
} else {
cov := nwords / na
remains := nwords - (na * cov)
f1 := float64(cov) / floatNwords
f2 := float64(cov+1) / floatNwords
emax = -(float64(na-remains)*f1*math.Log(f1) +
float64(remains)*f2*math.Log(f2))
}
if emax <= 0 {
continue
}
entropy := (logNwords - sumNLogN/floatNwords) / emax
if entropy < 0 {
entropy = 0
}
if entropy < minEntropy {
minEntropy = entropy
}
}
if minEntropy == math.MaxFloat64 {
return 1.0
}
return math.Round(minEntropy*10000) / 10000
}
// KmerEntropyFilter is a reusable entropy filter for batch processing.
// It pre-computes normalization tables and lookup values to avoid repeated
// allocation across millions of k-mers.
//
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
// Each goroutine must create its own instance via NewKmerEntropyFilter.
type KmerEntropyFilter struct {
k int
levelMax int
threshold float64
nLogN []float64
normTables [][]int
emaxValues []float64
logNwords []float64
// Pre-allocated frequency tables reused across Entropy() calls.
// One per word size (index 0 unused). Reset to zero before each use.
freqTables [][]int
}
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
//
// Parameters:
// - k: the k-mer size
// - levelMax: maximum sub-word size for entropy (typically 6)
// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
if levelMax >= k {
levelMax = k - 1
}
if levelMax < 1 {
levelMax = 1
}
nLogN := make([]float64, k+1)
for i := 1; i <= k; i++ {
nLogN[i] = float64(i) * math.Log(float64(i))
}
normTables := make([][]int, levelMax+1)
for ws := 1; ws <= levelMax; ws++ {
size := 1 << (ws * 2)
normTables[ws] = make([]int, size)
for code := 0; code < size; code++ {
normTables[ws][code] = int(NormalizeCircular(uint64(code), ws))
}
}
emaxValues := make([]float64, levelMax+1)
logNwords := make([]float64, levelMax+1)
for ws := 1; ws <= levelMax; ws++ {
nw := k - ws + 1
na := CanonicalCircularKmerCount(ws)
if nw < na {
logNwords[ws] = math.Log(float64(nw))
emaxValues[ws] = math.Log(float64(nw))
} else {
cov := nw / na
remains := nw - (na * cov)
f1 := float64(cov) / float64(nw)
f2 := float64(cov+1) / float64(nw)
logNwords[ws] = math.Log(float64(nw))
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
float64(remains)*f2*math.Log(f2))
}
}
// Pre-allocate frequency tables per word size
freqTables := make([][]int, levelMax+1)
for ws := 1; ws <= levelMax; ws++ {
freqTables[ws] = make([]int, 1<<(ws*2))
}
return &KmerEntropyFilter{
k: k,
levelMax: levelMax,
threshold: threshold,
nLogN: nLogN,
normTables: normTables,
emaxValues: emaxValues,
logNwords: logNwords,
freqTables: freqTables,
}
}
// Accept returns true if the k-mer has entropy strictly above the threshold.
// Low-complexity k-mers (entropy <= threshold) are rejected.
func (ef *KmerEntropyFilter) Accept(kmer uint64) bool {
return ef.Entropy(kmer) > ef.threshold
}
// Entropy computes the entropy for a single k-mer using pre-computed tables.
func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
k := ef.k
// Decode k-mer to DNA sequence
var seqBuf [32]byte
seq := DecodeKmer(kmer, k, seqBuf[:])
minEntropy := math.MaxFloat64
for ws := 1; ws <= ef.levelMax; ws++ {
nwords := k - ws + 1
if nwords < 1 {
continue
}
emax := ef.emaxValues[ws]
if emax <= 0 {
continue
}
// Count circular-canonical sub-word frequencies
tableSize := 1 << (ws * 2)
table := ef.freqTables[ws]
clear(table) // reset to zero
mask := (1 << (ws * 2)) - 1
normTable := ef.normTables[ws]
wordIndex := 0
for i := 0; i < ws-1; i++ {
wordIndex = (wordIndex << 2) + int(EncodeNucleotide(seq[i]))
}
for i, j := 0, ws-1; j < k; i, j = i+1, j+1 {
wordIndex = ((wordIndex << 2) & mask) + int(EncodeNucleotide(seq[j]))
normWord := normTable[wordIndex]
table[normWord]++
}
// Compute Shannon entropy
floatNwords := float64(nwords)
logNwords := ef.logNwords[ws]
var sumNLogN float64
for j := 0; j < tableSize; j++ {
n := table[j]
if n > 0 {
sumNLogN += ef.nLogN[n]
}
}
entropy := (logNwords - sumNLogN/floatNwords) / emax
if entropy < 0 {
entropy = 0
}
if entropy < minEntropy {
minEntropy = entropy
}
}
if minEntropy == math.MaxFloat64 {
return 1.0
}
return math.Round(minEntropy*10000) / 10000
}