Optimize low-complexity masking algorithm

This commit optimizes the low-complexity masking algorithm by:

1. Precomputing logarithm values and normalization tables to avoid repeated calculations
2. Replacing the MinMultiset-based sliding minimum with a more efficient deque-based implementation
3. Improving entropy calculation by using precomputed n*log(n) values
4. Simplifying the circular normalization process with precomputed tables
5. Removing unused imports and log statements

The changes significantly improve performance while maintaining the same masking behavior.
This commit is contained in:
Eric Coissac
2026-02-08 20:20:28 +01:00
parent c0ae49ef92
commit 99a8e69d10
2 changed files with 143 additions and 233 deletions

View File

@@ -8,8 +8,6 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
// MaskingMode defines how to handle low-complexity regions
@@ -37,62 +35,81 @@ const (
// Returns: a SeqWorker function that can be applied to each sequence
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
// ========================================================================
// FUNCTION 1: emax - Calculate theoretical maximum entropy
// ========================================================================
// Computes the maximum entropy of a k-mer of length lseq containing words of size word_size.
//
// Maximum entropy depends on the theoretical optimal word distribution:
// - If we have more positions (nw) than possible canonical words (na),
// some words will appear multiple times
// - We calculate the entropy of a distribution where all words appear
// cov or cov+1 times (most uniform distribution possible)
//
// IMPORTANT: Uses CanonicalCircularKmerCount to get the actual number of canonical words
// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
emax := func(lseq, word_size int) float64 {
nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq
na := obikmer.CanonicalCircularKmerCount(word_size) // Number of canonical words after normalization
// Case 1: Fewer positions than possible words
// Maximum entropy is simply log(nw) since we can have at most nw different words
if nw < na {
return math.Log(float64(nw))
nLogN := make([]float64, kmer_size+1)
for i := 1; i <= kmer_size; i++ {
nLogN[i] = float64(i) * math.Log(float64(i))
}
// Case 2: More positions than possible words
// Some words must appear multiple times
cov := nw / na // Average coverage (average number of occurrences per word)
remains := nw - (na * cov) // Number of words that will have one additional occurrence
normTables := make([][]int, level_max+1)
for ws := 1; ws <= level_max; ws++ {
size := 1 << (ws * 2)
normTables[ws] = make([]int, size)
for code := 0; code < size; code++ {
normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws))
}
}
// Calculate frequencies in the optimal distribution:
// - (na - remains) words appear cov times → frequency f1 = cov/nw
// - remains words appear (cov+1) times → frequency f2 = (cov+1)/nw
type pair struct {
index int
value float64
}
slidingMin := func(data []float64, window int) {
if len(data) == 0 || window <= 0 {
return
}
if window >= len(data) {
minVal := data[0]
for i := 1; i < len(data); i++ {
if data[i] < minVal {
minVal = data[i]
}
}
for i := range data {
data[i] = minVal
}
return
}
deque := make([]pair, 0, window)
for i, v := range data {
for len(deque) > 0 && deque[0].index <= i-window {
deque = deque[1:]
}
for len(deque) > 0 && deque[len(deque)-1].value >= v {
deque = deque[:len(deque)-1]
}
deque = append(deque, pair{index: i, value: v})
data[i] = deque[0].value
}
}
emaxValues := make([]float64, level_max+1)
logNwords := make([]float64, level_max+1)
for ws := 1; ws <= level_max; ws++ {
nw := kmer_size - ws + 1
na := obikmer.CanonicalCircularKmerCount(ws)
if nw < na {
logNwords[ws] = math.Log(float64(nw))
emaxValues[ws] = math.Log(float64(nw))
} else {
cov := nw / na
remains := nw - (na * cov)
f1 := float64(cov) / float64(nw)
f2 := float64(cov+1) / float64(nw)
// Shannon entropy: H = -Σ p(i) * log(p(i))
// where p(i) is the probability of observing word i
return -(float64(na-remains)*f1*math.Log(f1) +
logNwords[ws] = math.Log(float64(nw))
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
float64(remains)*f2*math.Log(f2))
}
}
// ========================================================================
// FUNCTION 2: maskAmbiguities - Mark positions containing ambiguities
// ========================================================================
// Identifies positions with ambiguous nucleotides (N, Y, R, etc.) and marks
// all k-mers that contain them.
//
// Returns: a slice where maskPositions[i] = -1 if position i is part of a
// k-mer containing an ambiguity, 0 otherwise
maskAmbiguities := func(sequence []byte) []int {
maskPositions := make([]int, len(sequence))
for i, nuc := range sequence {
// If nucleotide is not a, c, g or t (lowercase), it's an ambiguity
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
// Mark all positions of k-mers that contain this nucleotide
// A k-mer starting at position (i - kmer_size + 1) will contain position i
end := max(0, i-kmer_size+1)
for j := i; j >= end; j-- {
maskPositions[j] = -1
@@ -102,182 +119,87 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
return maskPositions
}
// ========================================================================
// FUNCTION 3: cleanTable - Reset a frequency table to zero
// ========================================================================
cleanTable := func(table []int, over int) {
for i := 0; i < over; i++ {
table[i] = 0
}
}
// ========================================================================
// FUNCTION 4: slidingMin - Calculate sliding minimum over a window
// ========================================================================
// Applies a sliding window of size window over data and replaces each
// value with the minimum in the window centered on that position.
//
// Uses a MinMultiset to efficiently maintain the minimum in the window.
slidingMin := func(data []float64, window int) {
minimier := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b })
ldata := len(data)
mem := make([]float64, window) // Circular buffer to store window values
// Initialize buffer with sentinel value
for i := range mem {
mem[i] = 10000
}
for i, v := range data {
// Get the old value leaving the window
m := mem[i%window]
mem[i%window] = v
// Remove old value from multiset if it was valid
if m < 10000 {
minimier.RemoveOne(m)
}
// Add new value if full window is ahead of us
if (ldata - i) >= window {
minimier.Add(v)
}
// log.Warnf("taille du minimier %d @ %d", minimier.Len(), i)
// Retrieve and store current minimum
var ok bool
if data[i], ok = minimier.Min(); !ok {
log.Error("problem with minimum entropy")
data[i] = 0.0
}
//xx, _ := minimier.Min()
//log.Warnf("Pos: %d n: %d min: %.3f -> %.3f", i, minimier.Len(), v, xx)
}
}
// ========================================================================
// FUNCTION 5: computeEntropies - Calculate normalized entropy for each position
// ========================================================================
// This is the central function that calculates the entropy of each k-mer in the sequence
// at a given scale (wordSize).
//
// Algorithm:
// 1. Encode the sequence into words (subsequences of size wordSize)
// 2. For each k-mer, count the frequencies of words it contains
// 3. Calculate normalized entropy = observed_entropy / maximum_entropy
// 4. Apply a sliding min filter to smooth results
//
// IMPORTANT: Line 147 uses NormalizeInt for circular normalization of words!
// This means "atg", "tga", and "gat" are considered the same word.
computeEntropies := func(sequence []byte,
maskPositions []int, // Positions of ambiguities
entropies []float64, // Output: normalized entropies for each position
table []int, // Frequency table for words (reused between calls)
words []int, // Buffer to store encoded words (reused)
wordSize int) { // Word size (scale of analysis)
maskPositions []int,
entropies []float64,
table []int,
words []int,
wordSize int,
normTable []int) {
lseq := len(sequence) // Sequence length
tableSize := 1 << (wordSize * 2) // Actual table size (must fit all codes 0 to 4^wordSize-1)
nwords := kmer_size - wordSize + 1 // Number of words in a k-mer
lseq := len(sequence)
tableSize := 1 << (wordSize * 2)
nwords := kmer_size - wordSize + 1
float_nwords := float64(nwords)
log_nwords := math.Log(float_nwords) // log(nwords) used in entropy calculation
entropyMax := emax(kmer_size, wordSize) // Theoretical maximum entropy (uses CanonicalKmerCount internally)
log_nwords := logNwords[wordSize]
entropyMax := emaxValues[wordSize]
// Reset frequency table (must clear entire table, not just nalpha entries)
cleanTable(table, tableSize)
for i := 1; i < lseq; i++ {
entropies[i] = 6
}
end := lseq - wordSize + 1 // Last position where a word can start
end := lseq - wordSize + 1
// ========================================================================
// STEP 1: Encode all words in the sequence
// ========================================================================
// Uses left-shift encoding: each nucleotide is encoded on 2 bits
// a=00, c=01, g=10, t=11
mask := (1 << (wordSize * 2)) - 1
mask := (1 << (wordSize * 2)) - 1 // Mask to keep only last wordSize*2 bits
// Initialize first word (all nucleotides except the last one)
word_index := 0
for i := 0; i < wordSize-1; i++ {
word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
}
// Encode all words with sliding window
for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
// Shift left by 2 bits, mask, and add new nucleotide
word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
// *** CIRCULAR NORMALIZATION ***
// Convert word to its canonical form (smallest by circular rotation)
// This is where "atg", "tga", "gat" all become "atg"
// Now using uint64-based NormalizeCircular for better performance
words[i] = int(obikmer.NormalizeCircular(uint64(word_index), wordSize))
words[i] = normTable[word_index]
}
// ========================================================================
// STEP 2: Calculate entropy for each k-mer with sliding window
// ========================================================================
s := 0 // Number of words processed in current k-mer
sum_n_logn := 0.0 // Sum of n*log(n) for entropy calculation
entropy := 1.0 // Current normalized entropy
cleaned := true // Flag indicating if table has been cleaned
s := 0
sum_n_logn := 0.0
entropy := 1.0
cleaned := true
for i := range end {
s++
switch {
// CASE 1: Filling phase (fewer than nwords words collected)
case s < nwords:
cleaned = false
table[words[i]]++ // Increment word frequency
table[words[i]]++
// CASE 2: Position contains an ambiguity
case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
entropies[i-nwords+1] = 4.0 // Mark entropy as invalid
entropies[i-nwords+1] = 4.0
if !cleaned {
cleanTable(table, tableSize) // Reset table
cleanTable(table, tableSize)
}
cleaned = true
s = 0
sum_n_logn = 0.0
// CASE 3: First complete k-mer (s == nwords)
case s == nwords:
cleaned = false
table[words[i]]++
// Calculate Shannon entropy: H = -Σ p(i)*log(p(i))
// = log(N) - (1/N)*Σ n(i)*log(n(i))
// where N = nwords, n(i) = frequency of word i
//
// NOTE: We iterate over entire table (tableSize = 4^wordSize) to count all frequencies.
// Canonical codes are not contiguous (e.g., for k=2: {0,1,2,3,5,6,7,10,11,15})
// so we must scan the full table even though only ~10 entries will be non-zero
sum_n_logn = 0
for j := range tableSize {
n := float64(table[j])
if n > 0 {
sum_n_logn += n * math.Log(n)
sum_n_logn += nLogN[int(n)]
}
}
// Normalized entropy = observed entropy / maximum entropy
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
// CASE 4: Sliding window (s > nwords)
// Incremental update of entropy by adding a new word
// and removing the old one
case s > nwords:
cleaned = false
new_word := words[i]
old_word := words[i-nwords]
// Optimization: only recalculate if word changes
if old_word != new_word {
table[new_word]++
table[old_word]--
@@ -285,59 +207,39 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
n_old := float64(table[old_word])
n_new := float64(table[new_word])
// Incremental update of sum_n_logn
// Remove contribution of old word (before decrement)
sum_n_logn -= (n_old + 1) * math.Log(n_old+1)
// Add contribution of old word (after decrement)
sum_n_logn -= nLogN[int(n_old+1)]
if n_old > 0 {
sum_n_logn += n_old * math.Log(n_old)
sum_n_logn += nLogN[int(n_old)]
}
// Add contribution of new word (after increment)
if n_new > 0 {
sum_n_logn += n_new * math.Log(n_new)
sum_n_logn += nLogN[int(n_new)]
}
// Remove contribution of new word (before increment)
if n_new > 1 {
sum_n_logn -= (n_new - 1) * math.Log(n_new-1)
sum_n_logn -= nLogN[int(n_new-1)]
}
}
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
}
// Store entropy for position corresponding to start of k-mer
if s >= nwords && maskPositions[i-nwords+1] >= 0 {
if entropy < 0 {
entropy = 0
}
entropy = math.Round(entropy*10000) / 10000
entropies[i-nwords+1] = entropy
}
}
// ========================================================================
// STEP 3: Apply sliding min filter
// ========================================================================
// Replace each entropy with minimum in window of size kmer_size
// This allows robust detection of low-complexity regions
slidingMin(entropies, kmer_size)
// log.Warnf("%v\n%v", e, entropies)
}
// ========================================================================
// FUNCTION 6: applyMaskMode - Apply masking to sequence
// ========================================================================
applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
// Create copy to avoid modifying original
seqCopy := sequence.Copy()
sequenceBytes := seqCopy.Sequence()
// Mask identified positions
for i := range sequenceBytes {
if maskPositions[i] {
// Operation &^ 32 converts to UPPERCASE (clears bit 5)
// sequenceBytes[i] = sequenceBytes[i] &^ 32
sequenceBytes[i] = mask
}
}
@@ -368,7 +270,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
}
}
// Handle the case where we end in a masked region
if inlow && fromlow >= 0 {
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
if err != nil {
@@ -403,7 +304,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
}
}
// Handle the case where we end in an unmasked region
if inhigh && fromhigh >= 0 {
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
if err != nil {
@@ -415,11 +315,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
return *rep, nil
}
// ========================================================================
// FUNCTION 7: masking - Main masking function
// ========================================================================
// Calculates entropies at all scales and masks positions
// whose minimum entropy is below the threshold.
masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
if sequence.Len() < kmer_size {
sequence.SetAttribute("obilowmask_error", "Sequence too short")
@@ -432,45 +327,27 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
bseq := sequence.Sequence()
// Identify ambiguities
maskPositions := maskAmbiguities(bseq)
// Initialize data structures
mask := make([]int, len(bseq)) // Stores scale detecting minimum entropy
entropies := make([]float64, len(bseq)) // Minimum entropy at each position
mask := make([]int, len(bseq))
entropies := make([]float64, len(bseq))
for i := range entropies {
entropies[i] = 4.0 // Very high initial value
entropies[i] = 4.0
}
freqs := make([]int, 1<<(2*level_max)) // Frequency table (max size)
words := make([]int, len(bseq)) // Buffer for encoded words
freqs := make([]int, 1<<(2*level_max))
words := make([]int, len(bseq))
entropies2 := make([]float64, len(bseq))
// ========================================================================
// Calculate entropy at maximum scale (level_max)
// ========================================================================
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max)
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max])
// Initialize mask with level_max everywhere (except ambiguities)
for i := range bseq {
v := level_max
// if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
// v = 0
// }
mask[i] = v
}
// ========================================================================
// Calculate entropy at lower scales
// ========================================================================
entropies2 := make([]float64, len(bseq))
for ws := level_max - 1; ws > 0; ws-- {
// *** WARNING: POTENTIAL BUG ***
// The parameter passed is level_max instead of ws!
// This means we always recalculate with the same scale
// Should be: computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
// Keep minimum entropy and corresponding scale
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws])
for i, e2 := range entropies2 {
if e2 < entropies[i] {
entropies[i] = e2
@@ -479,22 +356,17 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
}
}
// Force entropy to 0 for ambiguous positions
for i, nuc := range bseq {
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
entropies[i] = 0
}
}
// ========================================================================
// Identify positions to mask
// ========================================================================
remove := make([]bool, len(entropies))
for i, e := range entropies {
remove[i] = e <= threshold
}
// Save metadata in sequence attributes
sequence.SetAttribute("mask", mask)
sequence.SetAttribute("Entropies", entropies)
@@ -527,9 +399,7 @@ func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequenc
CLIMaskingChar(),
)
// Apply worker in parallel
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
// Filter resulting empty sequences
return newIter.FilterEmpty()
}

View File

@@ -0,0 +1,40 @@
package obilowmask
import (
"testing"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
func TestLowMaskWorker(t *testing.T) {
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test")
result, err := worker(seq)
if err != nil {
t.Fatalf("Worker failed: %v", err)
}
if result.Len() != 1 {
t.Fatalf("Expected 1 sequence, got %d", result.Len())
}
resultSeq := result[0]
if resultSeq.Len() != 32 {
t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len())
}
}
func TestLowMaskWorkerWithAmbiguity(t *testing.T) {
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test")
result, err := worker(seq)
if err != nil {
t.Fatalf("Worker failed: %v", err)
}
if result.Len() != 1 {
t.Fatalf("Expected 1 sequence, got %d", result.Len())
}
}