implementation de obilowmask

2026-06-24 09:41:00 +00:00 · 2025-10-20 17:43:25 +02:00
parent 07cdd6f758
commit 4603d7973e
13 changed files with 12636 additions and 2 deletions
@@ -0,0 +1,319 @@
+```{r}
+library(tidyverse)
+```
+
+```{r}
+x <- sample(1:4096, 29, replace=TRUE)
+```
+
+```{r}
+emax <- function(lseq,word_size) {
+  nword = lseq - word_size + 1
+  nalpha = 4^word_size
+  
+  if (nalpha < nword) {
+    cov = nword %/% nalpha
+    remains = nword %% nalpha   
+    f1 = cov/nword
+    f2 = (cov+1)/nword
+    print(c(nalpha - remains,f1,remains,f2))
+    e = -(nalpha - remains) * f1 * log(f1) -  
+         remains * f2 * log(f2)
+  } else {
+    e = log(nword)
+  }
+  
+  e
+}
+```
+
+```{r}
+ec <- function(data,kmer_size) {
+  table <- table(data)
+  s <- sum(table)
+  e <- sum(table * log(table))/s
+  ed <- log(s) - e
+  
+  em <- emax(s+kmer_size-1,kmer_size)
+  
+  ed/em
+}
+```
+
+```{r}
+ef <- function(data,kmer_size) {
+  table <- table(data)
+  s <- sum(table)
+  f <- table / s
+  
+  f <- as.numeric(f)
+  f <- f[f > 0]
+  
+  em <- emax(s+kmer_size-1,kmer_size)
+  ed <- -sum(f * log(f))
+  
+  print(c(ed,em,ed/em))
+  
+  ed/em
+}
+```
+
+```{r}
+okmer <- function(data,kmer_size) {
+    str_sub(data,1:(nchar(data)-kmer_size+1)) %>%
+    str_sub(1,kmer_size)
+}
+```
+
+```{r}
+# Normalisation circulaire: retourne le plus petit k-mer par rotation circulaire
+normalize_circular <- function(kmer) {
+  if (nchar(kmer) == 0) return(kmer)
+
+  canonical <- kmer
+  n <- nchar(kmer)
+
+  # Tester toutes les rotations circulaires
+  for (i in 2:n) {
+    rotated <- paste0(str_sub(kmer, i, n), str_sub(kmer, 1, i-1))
+    if (rotated < canonical) {
+      canonical <- rotated
+    }
+  }
+
+  canonical
+}
+```
+
+```{r}
+# Fonction totient d'Euler: compte le nombre d'entiers de 1 à n coprimes avec n
+euler_totient <- function(n) {
+  if (n <= 0) return(0)
+
+  result <- n
+  p <- 2
+
+  # Traiter tous les facteurs premiers
+  while (p * p <= n) {
+    if (n %% p == 0) {
+      # Retirer toutes les occurrences de p
+      while (n %% p == 0) {
+        n <- n %/% p
+      }
+      # Appliquer la formule: φ(n) = n * (1 - 1/p)
+      result <- result - result %/% p
+    }
+    p <- p + 1
+  }
+
+  # Si n est toujours > 1, alors c'est un facteur premier
+  if (n > 1) {
+    result <- result - result %/% n
+  }
+
+  result
+}
+```
+
+```{r}
+# Retourne tous les diviseurs de n
+divisors <- function(n) {
+  if (n <= 0) return(integer(0))
+
+  divs <- c()
+  i <- 1
+  while (i * i <= n) {
+    if (n %% i == 0) {
+      divs <- c(divs, i)
+      if (i != n %/% i) {
+        divs <- c(divs, n %/% i)
+      }
+    }
+    i <- i + 1
+  }
+
+  sort(divs)
+}
+```
+
+```{r}
+# Compte le nombre de colliers (necklaces) distincts de longueur n
+# sur un alphabet de taille a en utilisant la formule de Moreau:
+# N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
+# où la somme est sur tous les diviseurs d de n, et φ est la fonction totient d'Euler
+necklace_count <- function(n, alphabet_size) {
+  if (n <= 0) return(0)
+
+  divs <- divisors(n)
+  sum_val <- 0
+
+  for (d in divs) {
+    # Calculer alphabet_size^(n/d)
+    power <- alphabet_size^(n %/% d)
+    sum_val <- sum_val + euler_totient(d) * power
+  }
+
+  sum_val %/% n
+}
+```
+
+```{r}
+# Nombre de classes d'équivalence pour les k-mers normalisés
+# Utilise la formule exacte de Moreau pour compter les colliers (necklaces)
+n_normalized_kmers <- function(kmer_size) {
+  # Valeurs exactes pré-calculées pour k=1 à 6
+  if (kmer_size == 1) return(4)
+  if (kmer_size == 2) return(10)
+  if (kmer_size == 3) return(24)
+  if (kmer_size == 4) return(70)
+  if (kmer_size == 5) return(208)
+  if (kmer_size == 6) return(700)
+
+  # Pour k > 6, utiliser la formule de Moreau (exacte)
+  # Alphabet ADN a 4 bases
+  necklace_count(kmer_size, 4)
+}
+```
+
+```{r}
+# Entropie maximale pour k-mers normalisés
+enmax <- function(lseq, word_size) {
+  nword = lseq - word_size + 1
+  nalpha = n_normalized_kmers(word_size)
+
+  if (nalpha < nword) {
+    cov = nword %/% nalpha
+    remains = nword %% nalpha
+    f1 = cov/nword
+    f2 = (cov+1)/nword
+    e = -(nalpha - remains) * f1 * log(f1) -
+         remains * f2 * log(f2)
+  } else {
+    e = log(nword)
+  }
+
+  e
+}
+```
+
+```{r}
+# Entropie normalisée avec normalisation circulaire des k-mers
+ecn <- function(data, kmer_size) {
+  # Normaliser tous les k-mers
+  normalized_data <- sapply(data, normalize_circular)
+
+  # Calculer la table des fréquences
+  table <- table(normalized_data)
+  s <- sum(table)
+  e <- sum(table * log(table))/s
+  ed <- log(s) - e
+
+  # Entropie maximale avec normalisation
+  em <- enmax(s + kmer_size - 1, kmer_size)
+
+  ed/em
+}
+```
+
+```{r}
+k<-'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
+ec(okmer(k,1),1)
+ec(okmer(k,2),2)
+ec(okmer(k,3),3)
+ec(okmer(k,4),4)
+```
+
+```{r}
+k<-'atatatatatatatatatatatatatatata'
+ef(okmer(k,1),1)
+ef(okmer(k,2),2)
+ef(okmer(k,3),3)
+ef(okmer(k,4),4)
+```
+
+```{r}
+k<-'aaaaaaaaaaaaaaaattttttttttttttt'
+ef(okmer(k,1),1)
+ef(okmer(k,2),2)
+ef(okmer(k,3),3)
+ef(okmer(k,4),4)
+```
+
+```{r}
+k<-'atgatgatgatgatgatgatgatgatgatga'
+ef(okmer(k,1),1)
+ef(okmer(k,2),2)
+ef(okmer(k,3),3)
+ef(okmer(k,4),4)
+```
+
+```{r}
+k<-'atcgatcgatcgatcgatcgatcgatcgact'
+ecn(okmer(k,1),1)
+ecn(okmer(k,2),2)
+ecn(okmer(k,3),3)
+ecn(okmer(k,4),4)
+```
+
+```{r}
+k<-paste(sample(rep(c("a","c","g","t"),8),31),collapse="")
+k <- "actatggcaagtcgtaaccgcgcttatcagg"
+ecn(okmer(k,1),1)
+ecn(okmer(k,2),2)
+ecn(okmer(k,3),3)
+ecn(okmer(k,4),4)
+```
+
+aattaaaaaaacaagataaaataatattttt
+
+```{r}
+k<-'aattaaaaaaacaagataaaataatattttt'
+ecn(okmer(k,1),1)
+ecn(okmer(k,2),2)
+ecn(okmer(k,3),3)
+ecn(okmer(k,4),4)
+```
+
+atg tga gat ,,,,
+
+cat tca atc
+
+tgatgatgatgatgatgatgatgatgatg
+
+## Tests de normalisation circulaire
+
+```{r}
+# Test de la fonction de normalisation
+normalize_circular("ca")   # devrait donner "ac"
+normalize_circular("tgca") # devrait donner "atgc"
+normalize_circular("acgt") # devrait donner "acgt"
+```
+
+```{r}
+# Comparaison ec vs ecn sur une séquence répétitive
+# Les k-mers "atg", "tga", "gat" sont équivalents par rotation
+k <- 'atgatgatgatgatgatgatgatgatgatga'
+cat("Séquence:", k, "\n")
+cat("ec(k,3)  =", ec(okmer(k,3),3), "\n")
+cat("ecn(k,3) =", ecn(okmer(k,3),3), "\n")
+```
+
+```{r}
+# Comparaison sur séquence aléatoire
+k <- "actatggcaagtcgtaaccgcgcttatcagg"
+cat("Séquence:", k, "\n")
+cat("Sans normalisation:\n")
+cat("  ec(k,2) =", ec(okmer(k,2),2), "\n")
+cat("  ec(k,3) =", ec(okmer(k,3),3), "\n")
+cat("  ec(k,4) =", ec(okmer(k,4),4), "\n")
+cat("Avec normalisation circulaire:\n")
+cat("  ecn(k,2) =", ecn(okmer(k,2),2), "\n")
+cat("  ecn(k,3) =", ecn(okmer(k,3),3), "\n")
+cat("  ecn(k,4) =", ecn(okmer(k,4),4), "\n")
+```
+
+```{r}
+re <- rev(c(0.8108602271901116,0.8108602271901116,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.8041354757148719,0.7800272339058549,0.7800272339058549,0.7751610144606091,0.7751610144606091,0.7751610144606091,0.764858185548322,0.7325526601302021,0.7137620699527615,0.6789199521982864,0.6584536373623372,0.634002687184193,0.6075290415873623,0.5785545803330997,0.5785545803330997,0.5503220289212184,0.5315314387437778,0.4966893209893028,0.46077361820145696,0.42388221293245526,0.4009547969713408,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3561142883497758,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.3418776106000334,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.35141814451677883,0.390029016052137,0.42781461756157363,0.45192285937059073,0.47238917420654,0.47238917420654,0.47238917420654,0.5092805794755417,0.5451962822633876,0.5800384000178626,0.602395141014297,0.6046146614886381,0.6046146614886381,0.6119084258128231,0.6119084258128231,0.6214217106113492,0.6424704346756562,0.6482381543085467,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6828444721058894,0.6950205907027562,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.696103322070051,0.7208976112999935))
+
+di <- c(0.7208976112999935,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6961033220700509,0.6950205907027562,0.6828444721058894,0.6635191587399633,0.6635191587399633,0.6635191587399633,0.6482381543085467,0.6424704346756562,0.6214217106113492,0.6119084258128231,0.6119084258128231,0.6046146614886382,0.6046146614886382,0.6023951410142971,0.5800384000178627,0.5451962822633876,0.5092805794755418,0.47238917420654003,0.47238917420654003,0.47238917420654003,0.4519228593705908,0.4278146175615737,0.39002901605213713,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677894,0.35141814451677883,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3418776106000333,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.3561142883497762,0.40095479697134073,0.42388221293245526,0.46077361820145696,0.4966893209893028,0.5315314387437778,0.5503220289212184,0.5785545803330997,0.5785545803330997,0.6075290415873625,0.6340026871841933,0.6584536373623374,0.6789199521982866,0.7137620699527616,0.7325526601302023,0.7648581855483221,0.7751610144606093,0.7751610144606093,0.7751610144606093,0.7800272339058549,0.7800272339058549,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8041354757148721,0.8108602271901116,0.8108602271901116)
+```
@@ -0,0 +1,440 @@
+package obilowmask
+
+import (
+	"math"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
+	log "github.com/sirupsen/logrus"
+)
+
+// MaskingMode defines how to handle low-complexity regions
+type MaskingMode int
+
+const (
+	Mask  MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters
+	Split                    // Split mode: split sequence into high-complexity fragments
+)
+
+// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
+//
+// Algorithm principle:
+// Calculate the normalized entropy of each k-mer at different scales (wordSize = 1 to level_max).
+// K-mers with entropy below the threshold are masked.
+//
+// Parameters:
+//   - kmer_size: size of the sliding window for entropy calculation
+//   - level_max: maximum word size used for entropy calculation (finest scale)
+//   - threshold: normalized entropy threshold below which masking occurs (between 0 and 1)
+//   - mode: Mask (masking) or Split (splitting)
+//   - maskChar: character used for masking (typically 'n' or 'N')
+//
+// Returns: a SeqWorker function that can be applied to each sequence
+func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
+
+	// ========================================================================
+	// FUNCTION 1: emax - Calculate theoretical maximum entropy
+	// ========================================================================
+	// Computes the maximum entropy of a k-mer of length lseq containing words of size word_size.
+	//
+	// Maximum entropy depends on the theoretical optimal word distribution:
+	// - If we have more positions (nw) than possible canonical words (na),
+	//   some words will appear multiple times
+	// - We calculate the entropy of a distribution where all words appear
+	//   cov or cov+1 times (most uniform distribution possible)
+	//
+	// IMPORTANT: Uses CanonicalKmerCount to get the actual number of canonical words
+	// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
+	// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
+	emax := func(lseq, word_size int) float64 {
+		nw := lseq - word_size + 1                  // Number of words in a k-mer of length lseq
+		na := obikmer.CanonicalKmerCount(word_size) // Number of canonical words after normalization
+
+		// Case 1: Fewer positions than possible words
+		// Maximum entropy is simply log(nw) since we can have at most nw different words
+		if nw < na {
+			return math.Log(float64(nw))
+		}
+
+		// Case 2: More positions than possible words
+		// Some words must appear multiple times
+		cov := nw / na             // Average coverage (average number of occurrences per word)
+		remains := nw - (na * cov) // Number of words that will have one additional occurrence
+
+		// Calculate frequencies in the optimal distribution:
+		// - (na - remains) words appear cov times → frequency f1 = cov/nw
+		// - remains words appear (cov+1) times → frequency f2 = (cov+1)/nw
+		f1 := float64(cov) / float64(nw)
+		f2 := float64(cov+1) / float64(nw)
+
+		// Shannon entropy: H = -Σ p(i) * log(p(i))
+		// where p(i) is the probability of observing word i
+		return -(float64(na-remains)*f1*math.Log(f1) +
+			float64(remains)*f2*math.Log(f2))
+	}
+
+	// ========================================================================
+	// FUNCTION 2: maskAmbiguities - Mark positions containing ambiguities
+	// ========================================================================
+	// Identifies positions with ambiguous nucleotides (N, Y, R, etc.) and marks
+	// all k-mers that contain them.
+	//
+	// Returns: a slice where maskPositions[i] = -1 if position i is part of a
+	//          k-mer containing an ambiguity, 0 otherwise
+	maskAmbiguities := func(sequence []byte) []int {
+		maskPositions := make([]int, len(sequence))
+		for i, nuc := range sequence {
+			// If nucleotide is not a, c, g or t (lowercase), it's an ambiguity
+			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
+				// Mark all positions of k-mers that contain this nucleotide
+				// A k-mer starting at position (i - kmer_size + 1) will contain position i
+				end := max(0, i-kmer_size+1)
+				for j := i; j >= end; j-- {
+					maskPositions[j] = -1
+				}
+			}
+		}
+		return maskPositions
+	}
+
+	// ========================================================================
+	// FUNCTION 3: cleanTable - Reset a frequency table to zero
+	// ========================================================================
+	cleanTable := func(table []int, over int) {
+		for i := 0; i < over; i++ {
+			table[i] = 0
+		}
+	}
+
+	// ========================================================================
+	// FUNCTION 4: slidingMin - Calculate sliding minimum over a window
+	// ========================================================================
+	// Applies a sliding window of size window over data and replaces each
+	// value with the minimum in the window centered on that position.
+	//
+	// Uses a MinMultiset to efficiently maintain the minimum in the window.
+	slidingMin := func(data []float64, window int) {
+		minimier := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b })
+		ldata := len(data)
+		mem := make([]float64, window) // Circular buffer to store window values
+
+		// Initialize buffer with sentinel value
+		for i := range mem {
+			mem[i] = 10000
+		}
+
+		for i, v := range data {
+			// Get the old value leaving the window
+			m := mem[i%window]
+			mem[i%window] = v
+
+			// Remove old value from multiset if it was valid
+			if m < 10000 {
+				minimier.RemoveOne(m)
+			}
+
+			// Add new value if full window is ahead of us
+			if (ldata - i) >= window {
+				minimier.Add(v)
+			}
+
+			// Retrieve and store current minimum
+			var ok bool
+			if data[i], ok = minimier.Min(); !ok {
+				log.Error("problem with minimum entropy")
+				data[i] = 0.0
+			}
+
+			//xx, _ := minimier.Min()
+			//log.Warnf("Pos: %d n: %d min: %.3f -> %.3f", i, minimier.Len(), v, xx)
+		}
+	}
+
+	// ========================================================================
+	// FUNCTION 5: computeEntropies - Calculate normalized entropy for each position
+	// ========================================================================
+	// This is the central function that calculates the entropy of each k-mer in the sequence
+	// at a given scale (wordSize).
+	//
+	// Algorithm:
+	// 1. Encode the sequence into words (subsequences of size wordSize)
+	// 2. For each k-mer, count the frequencies of words it contains
+	// 3. Calculate normalized entropy = observed_entropy / maximum_entropy
+	// 4. Apply a sliding min filter to smooth results
+	//
+	// IMPORTANT: Line 147 uses NormalizeInt for circular normalization of words!
+	// This means "atg", "tga", and "gat" are considered the same word.
+	computeEntropies := func(sequence []byte,
+		maskPositions []int, // Positions of ambiguities
+		entropies []float64, // Output: normalized entropies for each position
+		table []int, // Frequency table for words (reused between calls)
+		words []int, // Buffer to store encoded words (reused)
+		wordSize int) { // Word size (scale of analysis)
+
+		lseq := len(sequence)              // Sequence length
+		tableSize := 1 << (wordSize * 2)   // Actual table size (must fit all codes 0 to 4^wordSize-1)
+		nwords := kmer_size - wordSize + 1 // Number of words in a k-mer
+		float_nwords := float64(nwords)
+		log_nwords := math.Log(float_nwords)    // log(nwords) used in entropy calculation
+		entropyMax := emax(kmer_size, wordSize) // Theoretical maximum entropy (uses CanonicalKmerCount internally)
+
+		// Reset frequency table (must clear entire table, not just nalpha entries)
+		cleanTable(table, tableSize)
+
+		for i := 1; i < lseq; i++ {
+			entropies[i] = 6
+		}
+		end := lseq - wordSize + 1 // Last position where a word can start
+
+		// ========================================================================
+		// STEP 1: Encode all words in the sequence
+		// ========================================================================
+		// Uses left-shift encoding: each nucleotide is encoded on 2 bits
+		// a=00, c=01, g=10, t=11
+
+		mask := (1 << (wordSize * 2)) - 1 // Mask to keep only last wordSize*2 bits
+
+		// Initialize first word (all nucleotides except the last one)
+		word_index := 0
+		for i := 0; i < wordSize-1; i++ {
+			word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
+		}
+
+		// Encode all words with sliding window
+		for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
+			// Shift left by 2 bits, mask, and add new nucleotide
+			word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
+
+			// *** CIRCULAR NORMALIZATION ***
+			// Convert word to its canonical form (smallest by circular rotation)
+			// This is where "atg", "tga", "gat" all become "atg"
+			words[i] = obikmer.NormalizeInt(word_index, wordSize)
+		}
+
+		// ========================================================================
+		// STEP 2: Calculate entropy for each k-mer with sliding window
+		// ========================================================================
+		s := 0            // Number of words processed in current k-mer
+		sum_n_logn := 0.0 // Sum of n*log(n) for entropy calculation
+		entropy := 1.0    // Current normalized entropy
+		cleaned := true   // Flag indicating if table has been cleaned
+
+		for i := range end {
+			s++
+
+			switch {
+			// CASE 1: Filling phase (fewer than nwords words collected)
+			case s < nwords:
+				cleaned = false
+				table[words[i]]++ // Increment word frequency
+
+			// CASE 2: Position contains an ambiguity
+			case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
+				entropies[i-nwords+1] = 4.0 // Mark entropy as invalid
+				if !cleaned {
+					cleanTable(table, tableSize) // Reset table
+				}
+				cleaned = true
+				s = 0
+				sum_n_logn = 0.0
+
+			// CASE 3: First complete k-mer (s == nwords)
+			case s == nwords:
+				cleaned = false
+				table[words[i]]++
+
+				// Calculate Shannon entropy: H = -Σ p(i)*log(p(i))
+				// = log(N) - (1/N)*Σ n(i)*log(n(i))
+				// where N = nwords, n(i) = frequency of word i
+				//
+				// NOTE: We iterate over entire table (tableSize = 4^wordSize) to count all frequencies.
+				// Canonical codes are not contiguous (e.g., for k=2: {0,1,2,3,5,6,7,10,11,15})
+				// so we must scan the full table even though only ~10 entries will be non-zero
+				sum_n_logn = 0
+				for j := range tableSize {
+					n := float64(table[j])
+					if n > 0 {
+						sum_n_logn += n * math.Log(n)
+					}
+				}
+				// Normalized entropy = observed entropy / maximum entropy
+				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
+
+			// CASE 4: Sliding window (s > nwords)
+			// Incremental update of entropy by adding a new word
+			// and removing the old one
+			case s > nwords:
+				cleaned = false
+
+				new_word := words[i]
+				old_word := words[i-nwords]
+
+				// Optimization: only recalculate if word changes
+				if old_word != new_word {
+					table[new_word]++
+					table[old_word]--
+
+					n_old := float64(table[old_word])
+					n_new := float64(table[new_word])
+
+					// Incremental update of sum_n_logn
+					// Remove contribution of old word (before decrement)
+					sum_n_logn -= (n_old + 1) * math.Log(n_old+1)
+					// Add contribution of old word (after decrement)
+					if n_old > 0 {
+						sum_n_logn += n_old * math.Log(n_old)
+					}
+					// Add contribution of new word (after increment)
+					if n_new > 0 {
+						sum_n_logn += n_new * math.Log(n_new)
+					}
+					// Remove contribution of new word (before increment)
+					if n_new > 1 {
+						sum_n_logn -= (n_new - 1) * math.Log(n_new-1)
+					}
+				}
+
+				entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
+			}
+
+			// Store entropy for position corresponding to start of k-mer
+			if s >= nwords && maskPositions[i-nwords+1] >= 0 {
+				if entropy == 0 {
+					log.Errorf("Zero entropy @ positon %d", i-nwords+1)
+				}
+				entropies[i-nwords+1] = entropy
+			}
+		}
+
+		// ========================================================================
+		// STEP 3: Apply sliding min filter
+		// ========================================================================
+		// Replace each entropy with minimum in window of size kmer_size
+		// This allows robust detection of low-complexity regions
+		slidingMin(entropies, kmer_size)
+		// log.Warnf("%v\n%v", e, entropies)
+	}
+
+	// ========================================================================
+	// FUNCTION 6: applyMaskMode - Apply masking to sequence
+	// ========================================================================
+	applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
+		// Create copy to avoid modifying original
+		seqCopy := sequence.Copy()
+		sequenceBytes := seqCopy.Sequence()
+
+		// Mask identified positions
+		for i := 0; i < len(sequenceBytes); i++ {
+			if maskPositions[i] {
+				// Operation &^ 32 converts to UPPERCASE (clears bit 5)
+				sequenceBytes[i] = sequenceBytes[i] &^ 32
+			}
+		}
+
+		return obiseq.BioSequenceSlice{seqCopy}, nil
+	}
+
+	// ========================================================================
+	// FUNCTION 7: masking - Main masking function
+	// ========================================================================
+	// Calculates entropies at all scales and masks positions
+	// whose minimum entropy is below the threshold.
+	masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
+		bseq := sequence.Sequence()
+
+		// Identify ambiguities
+		maskPositions := maskAmbiguities(bseq)
+
+		// Initialize data structures
+		mask := make([]int, len(bseq))          // Stores scale detecting minimum entropy
+		entropies := make([]float64, len(bseq)) // Minimum entropy at each position
+		for i := range entropies {
+			entropies[i] = 4.0 // Very high initial value
+		}
+
+		freqs := make([]int, 1<<(2*level_max)) // Frequency table (max size)
+		words := make([]int, len(bseq))        // Buffer for encoded words
+
+		// ========================================================================
+		// Calculate entropy at maximum scale (level_max)
+		// ========================================================================
+		computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max)
+
+		// Initialize mask with level_max everywhere (except ambiguities)
+		for i := range bseq {
+			v := level_max
+			// if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
+			// 	v = 0
+			// }
+			mask[i] = v
+		}
+
+		// ========================================================================
+		// Calculate entropy at lower scales
+		// ========================================================================
+		entropies2 := make([]float64, len(bseq))
+
+		for ws := level_max - 1; ws > 0; ws-- {
+			// *** WARNING: POTENTIAL BUG ***
+			// The parameter passed is level_max instead of ws!
+			// This means we always recalculate with the same scale
+			// Should be: computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
+			computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
+			// Keep minimum entropy and corresponding scale
+			for i, e2 := range entropies2 {
+				if e2 < entropies[i] {
+					entropies[i] = e2
+					mask[i] = ws
+				}
+			}
+		}
+
+		// Force entropy to 0 for ambiguous positions
+		for i, nuc := range bseq {
+			if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
+				entropies[i] = 0
+			}
+		}
+
+		// ========================================================================
+		// Identify positions to mask
+		// ========================================================================
+		remove := make([]bool, len(entropies))
+		for i, e := range entropies {
+			remove[i] = e <= threshold
+		}
+
+		// Save metadata in sequence attributes
+		sequence.SetAttribute("mask", mask)
+		sequence.SetAttribute("Entropies", entropies)
+
+		return applyMaskMode(sequence, remove, maskChar)
+	}
+
+	return masking
+}
+
+// CLISequenceEntropyMasker creates an iterator that applies entropy masking
+// to all sequences in an input iterator.
+//
+// Uses command-line parameters to configure the worker.
+func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequence {
+	var newIter obiiter.IBioSequence
+
+	worker := LowMaskWorker(
+		CLIKmerSize(),
+		CLILevelMax(),
+		CLIThreshold(),
+		CLIMaskingMode(),
+		CLIMaskingChar(),
+	)
+
+	// Apply worker in parallel
+	newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
+
+	// Filter resulting empty sequences
+	return newIter.FilterEmpty()
+}
@@ -0,0 +1,72 @@
+package obilowmask
+
+import (
+	"strings"
+
+	"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
+	"github.com/DavidGamba/go-getoptions"
+
+	log "github.com/sirupsen/logrus"
+)
+
+var __kmer_size__ = 31
+var __level_max__ = 6
+var __threshold__ = 0.5
+var __split_mode__ = false
+var __mask__ = "."
+
+func LowMaskOptionSet(options *getoptions.GetOpt) {
+
+	options.IntVar(&__kmer_size__, "kmer-size", __kmer_size__,
+		options.Description("Size of the kmer considered to estimate entropy."),
+	)
+
+	options.IntVar(&__level_max__, "entropy_size", __level_max__,
+		options.Description("Maximum word size considered for entropy estimate"),
+	)
+
+	options.Float64Var(&__threshold__, "threshold", __threshold__,
+		options.Description("entropy theshold used to mask a kmer"),
+	)
+
+	options.BoolVar(&__split_mode__, "--split-mode", __split_mode__,
+		options.Description("in split mode, input sequences are splitted to remove masked regions"),
+	)
+
+	options.StringVar(&__mask__, "--masking-char", __mask__,
+		options.Description("Character used to mask low complexity region"),
+	)
+}
+
+func OptionSet(options *getoptions.GetOpt) {
+	LowMaskOptionSet(options)
+	obiconvert.InputOptionSet(options)
+}
+
+func CLIKmerSize() int {
+	return __kmer_size__
+}
+
+func CLILevelMax() int {
+	return __level_max__
+}
+
+func CLIThreshold() float64 {
+	return __threshold__
+}
+
+func CLIMaskingMode() MaskingMode {
+	if __split_mode__ {
+		return Split
+	} else {
+		return Mask
+	}
+}
+
+func CLIMaskingChar() byte {
+	mask := strings.TrimSpace(__mask__)
+	if len(mask) != 1 {
+		log.Fatalf("--masking-char option accept a single character, not %s", mask)
+	}
+	return []byte(mask)[0]
+}