diff --git a/pkg/obitools/obilowmask/obilowmask.go b/pkg/obitools/obilowmask/obilowmask.go index b8ba28d..893cab2 100644 --- a/pkg/obitools/obilowmask/obilowmask.go +++ b/pkg/obitools/obilowmask/obilowmask.go @@ -8,8 +8,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" - log "github.com/sirupsen/logrus" ) // MaskingMode defines how to handle low-complexity regions @@ -37,62 +35,81 @@ const ( // Returns: a SeqWorker function that can be applied to each sequence func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker { - // ======================================================================== - // FUNCTION 1: emax - Calculate theoretical maximum entropy - // ======================================================================== - // Computes the maximum entropy of a k-mer of length lseq containing words of size word_size. - // - // Maximum entropy depends on the theoretical optimal word distribution: - // - If we have more positions (nw) than possible canonical words (na), - // some words will appear multiple times - // - We calculate the entropy of a distribution where all words appear - // cov or cov+1 times (most uniform distribution possible) - // - // IMPORTANT: Uses CanonicalCircularKmerCount to get the actual number of canonical words - // after circular normalization (e.g., "atg", "tga", "gat" → all "atg"). - // This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2). - emax := func(lseq, word_size int) float64 { - nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq - na := obikmer.CanonicalCircularKmerCount(word_size) // Number of canonical words after normalization - - // Case 1: Fewer positions than possible words - // Maximum entropy is simply log(nw) since we can have at most nw different words - if nw < na { - return math.Log(float64(nw)) - } - - // Case 2: More positions than possible words - // Some words must appear multiple times - cov := nw / na // Average coverage (average number of occurrences per word) - remains := nw - (na * cov) // Number of words that will have one additional occurrence - - // Calculate frequencies in the optimal distribution: - // - (na - remains) words appear cov times → frequency f1 = cov/nw - // - remains words appear (cov+1) times → frequency f2 = (cov+1)/nw - f1 := float64(cov) / float64(nw) - f2 := float64(cov+1) / float64(nw) - - // Shannon entropy: H = -Σ p(i) * log(p(i)) - // where p(i) is the probability of observing word i - return -(float64(na-remains)*f1*math.Log(f1) + - float64(remains)*f2*math.Log(f2)) + nLogN := make([]float64, kmer_size+1) + for i := 1; i <= kmer_size; i++ { + nLogN[i] = float64(i) * math.Log(float64(i)) + } + + normTables := make([][]int, level_max+1) + for ws := 1; ws <= level_max; ws++ { + size := 1 << (ws * 2) + normTables[ws] = make([]int, size) + for code := 0; code < size; code++ { + normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws)) + } + } + + type pair struct { + index int + value float64 + } + + slidingMin := func(data []float64, window int) { + if len(data) == 0 || window <= 0 { + return + } + if window >= len(data) { + minVal := data[0] + for i := 1; i < len(data); i++ { + if data[i] < minVal { + minVal = data[i] + } + } + for i := range data { + data[i] = minVal + } + return + } + + deque := make([]pair, 0, window) + + for i, v := range data { + for len(deque) > 0 && deque[0].index <= i-window { + deque = deque[1:] + } + + for len(deque) > 0 && deque[len(deque)-1].value >= v { + deque = deque[:len(deque)-1] + } + + deque = append(deque, pair{index: i, value: v}) + + data[i] = deque[0].value + } + } + emaxValues := make([]float64, level_max+1) + logNwords := make([]float64, level_max+1) + for ws := 1; ws <= level_max; ws++ { + nw := kmer_size - ws + 1 + na := obikmer.CanonicalCircularKmerCount(ws) + if nw < na { + logNwords[ws] = math.Log(float64(nw)) + emaxValues[ws] = math.Log(float64(nw)) + } else { + cov := nw / na + remains := nw - (na * cov) + f1 := float64(cov) / float64(nw) + f2 := float64(cov+1) / float64(nw) + logNwords[ws] = math.Log(float64(nw)) + emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) + + float64(remains)*f2*math.Log(f2)) + } } - // ======================================================================== - // FUNCTION 2: maskAmbiguities - Mark positions containing ambiguities - // ======================================================================== - // Identifies positions with ambiguous nucleotides (N, Y, R, etc.) and marks - // all k-mers that contain them. - // - // Returns: a slice where maskPositions[i] = -1 if position i is part of a - // k-mer containing an ambiguity, 0 otherwise maskAmbiguities := func(sequence []byte) []int { maskPositions := make([]int, len(sequence)) for i, nuc := range sequence { - // If nucleotide is not a, c, g or t (lowercase), it's an ambiguity if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' { - // Mark all positions of k-mers that contain this nucleotide - // A k-mer starting at position (i - kmer_size + 1) will contain position i end := max(0, i-kmer_size+1) for j := i; j >= end; j-- { maskPositions[j] = -1 @@ -102,182 +119,87 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking return maskPositions } - // ======================================================================== - // FUNCTION 3: cleanTable - Reset a frequency table to zero - // ======================================================================== cleanTable := func(table []int, over int) { for i := 0; i < over; i++ { table[i] = 0 } } - // ======================================================================== - // FUNCTION 4: slidingMin - Calculate sliding minimum over a window - // ======================================================================== - // Applies a sliding window of size window over data and replaces each - // value with the minimum in the window centered on that position. - // - // Uses a MinMultiset to efficiently maintain the minimum in the window. - slidingMin := func(data []float64, window int) { - minimier := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b }) - ldata := len(data) - mem := make([]float64, window) // Circular buffer to store window values - - // Initialize buffer with sentinel value - for i := range mem { - mem[i] = 10000 - } - - for i, v := range data { - // Get the old value leaving the window - m := mem[i%window] - mem[i%window] = v - - // Remove old value from multiset if it was valid - if m < 10000 { - minimier.RemoveOne(m) - } - - // Add new value if full window is ahead of us - if (ldata - i) >= window { - minimier.Add(v) - } - - // log.Warnf("taille du minimier %d @ %d", minimier.Len(), i) - - // Retrieve and store current minimum - var ok bool - if data[i], ok = minimier.Min(); !ok { - log.Error("problem with minimum entropy") - data[i] = 0.0 - } - - //xx, _ := minimier.Min() - //log.Warnf("Pos: %d n: %d min: %.3f -> %.3f", i, minimier.Len(), v, xx) - } - } - - // ======================================================================== - // FUNCTION 5: computeEntropies - Calculate normalized entropy for each position - // ======================================================================== - // This is the central function that calculates the entropy of each k-mer in the sequence - // at a given scale (wordSize). - // - // Algorithm: - // 1. Encode the sequence into words (subsequences of size wordSize) - // 2. For each k-mer, count the frequencies of words it contains - // 3. Calculate normalized entropy = observed_entropy / maximum_entropy - // 4. Apply a sliding min filter to smooth results - // - // IMPORTANT: Line 147 uses NormalizeInt for circular normalization of words! - // This means "atg", "tga", and "gat" are considered the same word. computeEntropies := func(sequence []byte, - maskPositions []int, // Positions of ambiguities - entropies []float64, // Output: normalized entropies for each position - table []int, // Frequency table for words (reused between calls) - words []int, // Buffer to store encoded words (reused) - wordSize int) { // Word size (scale of analysis) + maskPositions []int, + entropies []float64, + table []int, + words []int, + wordSize int, + normTable []int) { - lseq := len(sequence) // Sequence length - tableSize := 1 << (wordSize * 2) // Actual table size (must fit all codes 0 to 4^wordSize-1) - nwords := kmer_size - wordSize + 1 // Number of words in a k-mer + lseq := len(sequence) + tableSize := 1 << (wordSize * 2) + nwords := kmer_size - wordSize + 1 float_nwords := float64(nwords) - log_nwords := math.Log(float_nwords) // log(nwords) used in entropy calculation - entropyMax := emax(kmer_size, wordSize) // Theoretical maximum entropy (uses CanonicalKmerCount internally) + log_nwords := logNwords[wordSize] + entropyMax := emaxValues[wordSize] - // Reset frequency table (must clear entire table, not just nalpha entries) cleanTable(table, tableSize) for i := 1; i < lseq; i++ { entropies[i] = 6 } - end := lseq - wordSize + 1 // Last position where a word can start + end := lseq - wordSize + 1 - // ======================================================================== - // STEP 1: Encode all words in the sequence - // ======================================================================== - // Uses left-shift encoding: each nucleotide is encoded on 2 bits - // a=00, c=01, g=10, t=11 + mask := (1 << (wordSize * 2)) - 1 - mask := (1 << (wordSize * 2)) - 1 // Mask to keep only last wordSize*2 bits - - // Initialize first word (all nucleotides except the last one) word_index := 0 for i := 0; i < wordSize-1; i++ { word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i])) } - // Encode all words with sliding window for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 { - // Shift left by 2 bits, mask, and add new nucleotide word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j])) - - // *** CIRCULAR NORMALIZATION *** - // Convert word to its canonical form (smallest by circular rotation) - // This is where "atg", "tga", "gat" all become "atg" - // Now using uint64-based NormalizeCircular for better performance - words[i] = int(obikmer.NormalizeCircular(uint64(word_index), wordSize)) + words[i] = normTable[word_index] } - // ======================================================================== - // STEP 2: Calculate entropy for each k-mer with sliding window - // ======================================================================== - s := 0 // Number of words processed in current k-mer - sum_n_logn := 0.0 // Sum of n*log(n) for entropy calculation - entropy := 1.0 // Current normalized entropy - cleaned := true // Flag indicating if table has been cleaned + s := 0 + sum_n_logn := 0.0 + entropy := 1.0 + cleaned := true for i := range end { s++ switch { - // CASE 1: Filling phase (fewer than nwords words collected) case s < nwords: cleaned = false - table[words[i]]++ // Increment word frequency + table[words[i]]++ - // CASE 2: Position contains an ambiguity case i >= (nwords-1) && maskPositions[i-nwords+1] < 0: - entropies[i-nwords+1] = 4.0 // Mark entropy as invalid + entropies[i-nwords+1] = 4.0 if !cleaned { - cleanTable(table, tableSize) // Reset table + cleanTable(table, tableSize) } cleaned = true s = 0 sum_n_logn = 0.0 - // CASE 3: First complete k-mer (s == nwords) case s == nwords: cleaned = false table[words[i]]++ - // Calculate Shannon entropy: H = -Σ p(i)*log(p(i)) - // = log(N) - (1/N)*Σ n(i)*log(n(i)) - // where N = nwords, n(i) = frequency of word i - // - // NOTE: We iterate over entire table (tableSize = 4^wordSize) to count all frequencies. - // Canonical codes are not contiguous (e.g., for k=2: {0,1,2,3,5,6,7,10,11,15}) - // so we must scan the full table even though only ~10 entries will be non-zero sum_n_logn = 0 for j := range tableSize { n := float64(table[j]) if n > 0 { - sum_n_logn += n * math.Log(n) + sum_n_logn += nLogN[int(n)] } } - // Normalized entropy = observed entropy / maximum entropy entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax - // CASE 4: Sliding window (s > nwords) - // Incremental update of entropy by adding a new word - // and removing the old one case s > nwords: cleaned = false new_word := words[i] old_word := words[i-nwords] - // Optimization: only recalculate if word changes if old_word != new_word { table[new_word]++ table[old_word]-- @@ -285,59 +207,39 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking n_old := float64(table[old_word]) n_new := float64(table[new_word]) - // Incremental update of sum_n_logn - // Remove contribution of old word (before decrement) - sum_n_logn -= (n_old + 1) * math.Log(n_old+1) - // Add contribution of old word (after decrement) + sum_n_logn -= nLogN[int(n_old+1)] if n_old > 0 { - sum_n_logn += n_old * math.Log(n_old) + sum_n_logn += nLogN[int(n_old)] } - // Add contribution of new word (after increment) if n_new > 0 { - sum_n_logn += n_new * math.Log(n_new) + sum_n_logn += nLogN[int(n_new)] } - // Remove contribution of new word (before increment) if n_new > 1 { - sum_n_logn -= (n_new - 1) * math.Log(n_new-1) + sum_n_logn -= nLogN[int(n_new-1)] } } entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax } - // Store entropy for position corresponding to start of k-mer if s >= nwords && maskPositions[i-nwords+1] >= 0 { if entropy < 0 { entropy = 0 - } entropy = math.Round(entropy*10000) / 10000 entropies[i-nwords+1] = entropy } } - // ======================================================================== - // STEP 3: Apply sliding min filter - // ======================================================================== - // Replace each entropy with minimum in window of size kmer_size - // This allows robust detection of low-complexity regions slidingMin(entropies, kmer_size) - // log.Warnf("%v\n%v", e, entropies) } - // ======================================================================== - // FUNCTION 6: applyMaskMode - Apply masking to sequence - // ======================================================================== applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) { - // Create copy to avoid modifying original seqCopy := sequence.Copy() sequenceBytes := seqCopy.Sequence() - // Mask identified positions for i := range sequenceBytes { if maskPositions[i] { - // Operation &^ 32 converts to UPPERCASE (clears bit 5) - // sequenceBytes[i] = sequenceBytes[i] &^ 32 sequenceBytes[i] = mask } } @@ -368,7 +270,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking } } - // Handle the case where we end in a masked region if inlow && fromlow >= 0 { frg, err := sequence.Subsequence(fromlow, len(maskPosition), false) if err != nil { @@ -403,7 +304,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking } } - // Handle the case where we end in an unmasked region if inhigh && fromhigh >= 0 { frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false) if err != nil { @@ -415,11 +315,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking return *rep, nil } - // ======================================================================== - // FUNCTION 7: masking - Main masking function - // ======================================================================== - // Calculates entropies at all scales and masks positions - // whose minimum entropy is below the threshold. masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { if sequence.Len() < kmer_size { sequence.SetAttribute("obilowmask_error", "Sequence too short") @@ -432,45 +327,27 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking bseq := sequence.Sequence() - // Identify ambiguities maskPositions := maskAmbiguities(bseq) - // Initialize data structures - mask := make([]int, len(bseq)) // Stores scale detecting minimum entropy - entropies := make([]float64, len(bseq)) // Minimum entropy at each position + mask := make([]int, len(bseq)) + entropies := make([]float64, len(bseq)) for i := range entropies { - entropies[i] = 4.0 // Very high initial value + entropies[i] = 4.0 } - freqs := make([]int, 1<<(2*level_max)) // Frequency table (max size) - words := make([]int, len(bseq)) // Buffer for encoded words + freqs := make([]int, 1<<(2*level_max)) + words := make([]int, len(bseq)) + entropies2 := make([]float64, len(bseq)) - // ======================================================================== - // Calculate entropy at maximum scale (level_max) - // ======================================================================== - computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max) + computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max]) - // Initialize mask with level_max everywhere (except ambiguities) for i := range bseq { v := level_max - // if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' { - // v = 0 - // } mask[i] = v } - // ======================================================================== - // Calculate entropy at lower scales - // ======================================================================== - entropies2 := make([]float64, len(bseq)) - for ws := level_max - 1; ws > 0; ws-- { - // *** WARNING: POTENTIAL BUG *** - // The parameter passed is level_max instead of ws! - // This means we always recalculate with the same scale - // Should be: computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws) - computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws) - // Keep minimum entropy and corresponding scale + computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws]) for i, e2 := range entropies2 { if e2 < entropies[i] { entropies[i] = e2 @@ -479,22 +356,17 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking } } - // Force entropy to 0 for ambiguous positions for i, nuc := range bseq { if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' { entropies[i] = 0 } } - // ======================================================================== - // Identify positions to mask - // ======================================================================== remove := make([]bool, len(entropies)) for i, e := range entropies { remove[i] = e <= threshold } - // Save metadata in sequence attributes sequence.SetAttribute("mask", mask) sequence.SetAttribute("Entropies", entropies) @@ -527,9 +399,7 @@ func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequenc CLIMaskingChar(), ) - // Apply worker in parallel newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers()) - // Filter resulting empty sequences return newIter.FilterEmpty() } diff --git a/pkg/obitools/obilowmask/obilowmask_test.go b/pkg/obitools/obilowmask/obilowmask_test.go new file mode 100644 index 0000000..4afd764 --- /dev/null +++ b/pkg/obitools/obilowmask/obilowmask_test.go @@ -0,0 +1,40 @@ +package obilowmask + +import ( + "testing" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +func TestLowMaskWorker(t *testing.T) { + worker := LowMaskWorker(31, 6, 0.3, Mask, 'n') + + seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test") + result, err := worker(seq) + if err != nil { + t.Fatalf("Worker failed: %v", err) + } + + if result.Len() != 1 { + t.Fatalf("Expected 1 sequence, got %d", result.Len()) + } + + resultSeq := result[0] + if resultSeq.Len() != 32 { + t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len()) + } +} + +func TestLowMaskWorkerWithAmbiguity(t *testing.T) { + worker := LowMaskWorker(31, 6, 0.3, Mask, 'n') + + seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test") + result, err := worker(seq) + if err != nil { + t.Fatalf("Worker failed: %v", err) + } + + if result.Len() != 1 { + t.Fatalf("Expected 1 sequence, got %d", result.Len()) + } +}