mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Optimize low-complexity masking algorithm
This commit optimizes the low-complexity masking algorithm by: 1. Precomputing logarithm values and normalization tables to avoid repeated calculations 2. Replacing the MinMultiset-based sliding minimum with a more efficient deque-based implementation 3. Improving entropy calculation by using precomputed n*log(n) values 4. Simplifying the circular normalization process with precomputed tables 5. Removing unused imports and log statements The changes significantly improve performance while maintaining the same masking behavior.
This commit is contained in:
@@ -8,8 +8,6 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// MaskingMode defines how to handle low-complexity regions
|
||||
@@ -37,62 +35,81 @@ const (
|
||||
// Returns: a SeqWorker function that can be applied to each sequence
|
||||
func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 1: emax - Calculate theoretical maximum entropy
|
||||
// ========================================================================
|
||||
// Computes the maximum entropy of a k-mer of length lseq containing words of size word_size.
|
||||
//
|
||||
// Maximum entropy depends on the theoretical optimal word distribution:
|
||||
// - If we have more positions (nw) than possible canonical words (na),
|
||||
// some words will appear multiple times
|
||||
// - We calculate the entropy of a distribution where all words appear
|
||||
// cov or cov+1 times (most uniform distribution possible)
|
||||
//
|
||||
// IMPORTANT: Uses CanonicalCircularKmerCount to get the actual number of canonical words
|
||||
// after circular normalization (e.g., "atg", "tga", "gat" → all "atg").
|
||||
// This is much smaller than 4^word_size (e.g., 10 instead of 16 for word_size=2).
|
||||
emax := func(lseq, word_size int) float64 {
|
||||
nw := lseq - word_size + 1 // Number of words in a k-mer of length lseq
|
||||
na := obikmer.CanonicalCircularKmerCount(word_size) // Number of canonical words after normalization
|
||||
|
||||
// Case 1: Fewer positions than possible words
|
||||
// Maximum entropy is simply log(nw) since we can have at most nw different words
|
||||
if nw < na {
|
||||
return math.Log(float64(nw))
|
||||
nLogN := make([]float64, kmer_size+1)
|
||||
for i := 1; i <= kmer_size; i++ {
|
||||
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||
}
|
||||
|
||||
// Case 2: More positions than possible words
|
||||
// Some words must appear multiple times
|
||||
cov := nw / na // Average coverage (average number of occurrences per word)
|
||||
remains := nw - (na * cov) // Number of words that will have one additional occurrence
|
||||
normTables := make([][]int, level_max+1)
|
||||
for ws := 1; ws <= level_max; ws++ {
|
||||
size := 1 << (ws * 2)
|
||||
normTables[ws] = make([]int, size)
|
||||
for code := 0; code < size; code++ {
|
||||
normTables[ws][code] = int(obikmer.NormalizeCircular(uint64(code), ws))
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate frequencies in the optimal distribution:
|
||||
// - (na - remains) words appear cov times → frequency f1 = cov/nw
|
||||
// - remains words appear (cov+1) times → frequency f2 = (cov+1)/nw
|
||||
type pair struct {
|
||||
index int
|
||||
value float64
|
||||
}
|
||||
|
||||
slidingMin := func(data []float64, window int) {
|
||||
if len(data) == 0 || window <= 0 {
|
||||
return
|
||||
}
|
||||
if window >= len(data) {
|
||||
minVal := data[0]
|
||||
for i := 1; i < len(data); i++ {
|
||||
if data[i] < minVal {
|
||||
minVal = data[i]
|
||||
}
|
||||
}
|
||||
for i := range data {
|
||||
data[i] = minVal
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
deque := make([]pair, 0, window)
|
||||
|
||||
for i, v := range data {
|
||||
for len(deque) > 0 && deque[0].index <= i-window {
|
||||
deque = deque[1:]
|
||||
}
|
||||
|
||||
for len(deque) > 0 && deque[len(deque)-1].value >= v {
|
||||
deque = deque[:len(deque)-1]
|
||||
}
|
||||
|
||||
deque = append(deque, pair{index: i, value: v})
|
||||
|
||||
data[i] = deque[0].value
|
||||
}
|
||||
}
|
||||
emaxValues := make([]float64, level_max+1)
|
||||
logNwords := make([]float64, level_max+1)
|
||||
for ws := 1; ws <= level_max; ws++ {
|
||||
nw := kmer_size - ws + 1
|
||||
na := obikmer.CanonicalCircularKmerCount(ws)
|
||||
if nw < na {
|
||||
logNwords[ws] = math.Log(float64(nw))
|
||||
emaxValues[ws] = math.Log(float64(nw))
|
||||
} else {
|
||||
cov := nw / na
|
||||
remains := nw - (na * cov)
|
||||
f1 := float64(cov) / float64(nw)
|
||||
f2 := float64(cov+1) / float64(nw)
|
||||
|
||||
// Shannon entropy: H = -Σ p(i) * log(p(i))
|
||||
// where p(i) is the probability of observing word i
|
||||
return -(float64(na-remains)*f1*math.Log(f1) +
|
||||
logNwords[ws] = math.Log(float64(nw))
|
||||
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||
float64(remains)*f2*math.Log(f2))
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 2: maskAmbiguities - Mark positions containing ambiguities
|
||||
// ========================================================================
|
||||
// Identifies positions with ambiguous nucleotides (N, Y, R, etc.) and marks
|
||||
// all k-mers that contain them.
|
||||
//
|
||||
// Returns: a slice where maskPositions[i] = -1 if position i is part of a
|
||||
// k-mer containing an ambiguity, 0 otherwise
|
||||
maskAmbiguities := func(sequence []byte) []int {
|
||||
maskPositions := make([]int, len(sequence))
|
||||
for i, nuc := range sequence {
|
||||
// If nucleotide is not a, c, g or t (lowercase), it's an ambiguity
|
||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
// Mark all positions of k-mers that contain this nucleotide
|
||||
// A k-mer starting at position (i - kmer_size + 1) will contain position i
|
||||
end := max(0, i-kmer_size+1)
|
||||
for j := i; j >= end; j-- {
|
||||
maskPositions[j] = -1
|
||||
@@ -102,182 +119,87 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
return maskPositions
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 3: cleanTable - Reset a frequency table to zero
|
||||
// ========================================================================
|
||||
cleanTable := func(table []int, over int) {
|
||||
for i := 0; i < over; i++ {
|
||||
table[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 4: slidingMin - Calculate sliding minimum over a window
|
||||
// ========================================================================
|
||||
// Applies a sliding window of size window over data and replaces each
|
||||
// value with the minimum in the window centered on that position.
|
||||
//
|
||||
// Uses a MinMultiset to efficiently maintain the minimum in the window.
|
||||
slidingMin := func(data []float64, window int) {
|
||||
minimier := obiutils.NewMinMultiset(func(a, b float64) bool { return a < b })
|
||||
ldata := len(data)
|
||||
mem := make([]float64, window) // Circular buffer to store window values
|
||||
|
||||
// Initialize buffer with sentinel value
|
||||
for i := range mem {
|
||||
mem[i] = 10000
|
||||
}
|
||||
|
||||
for i, v := range data {
|
||||
// Get the old value leaving the window
|
||||
m := mem[i%window]
|
||||
mem[i%window] = v
|
||||
|
||||
// Remove old value from multiset if it was valid
|
||||
if m < 10000 {
|
||||
minimier.RemoveOne(m)
|
||||
}
|
||||
|
||||
// Add new value if full window is ahead of us
|
||||
if (ldata - i) >= window {
|
||||
minimier.Add(v)
|
||||
}
|
||||
|
||||
// log.Warnf("taille du minimier %d @ %d", minimier.Len(), i)
|
||||
|
||||
// Retrieve and store current minimum
|
||||
var ok bool
|
||||
if data[i], ok = minimier.Min(); !ok {
|
||||
log.Error("problem with minimum entropy")
|
||||
data[i] = 0.0
|
||||
}
|
||||
|
||||
//xx, _ := minimier.Min()
|
||||
//log.Warnf("Pos: %d n: %d min: %.3f -> %.3f", i, minimier.Len(), v, xx)
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 5: computeEntropies - Calculate normalized entropy for each position
|
||||
// ========================================================================
|
||||
// This is the central function that calculates the entropy of each k-mer in the sequence
|
||||
// at a given scale (wordSize).
|
||||
//
|
||||
// Algorithm:
|
||||
// 1. Encode the sequence into words (subsequences of size wordSize)
|
||||
// 2. For each k-mer, count the frequencies of words it contains
|
||||
// 3. Calculate normalized entropy = observed_entropy / maximum_entropy
|
||||
// 4. Apply a sliding min filter to smooth results
|
||||
//
|
||||
// IMPORTANT: Line 147 uses NormalizeInt for circular normalization of words!
|
||||
// This means "atg", "tga", and "gat" are considered the same word.
|
||||
computeEntropies := func(sequence []byte,
|
||||
maskPositions []int, // Positions of ambiguities
|
||||
entropies []float64, // Output: normalized entropies for each position
|
||||
table []int, // Frequency table for words (reused between calls)
|
||||
words []int, // Buffer to store encoded words (reused)
|
||||
wordSize int) { // Word size (scale of analysis)
|
||||
maskPositions []int,
|
||||
entropies []float64,
|
||||
table []int,
|
||||
words []int,
|
||||
wordSize int,
|
||||
normTable []int) {
|
||||
|
||||
lseq := len(sequence) // Sequence length
|
||||
tableSize := 1 << (wordSize * 2) // Actual table size (must fit all codes 0 to 4^wordSize-1)
|
||||
nwords := kmer_size - wordSize + 1 // Number of words in a k-mer
|
||||
lseq := len(sequence)
|
||||
tableSize := 1 << (wordSize * 2)
|
||||
nwords := kmer_size - wordSize + 1
|
||||
float_nwords := float64(nwords)
|
||||
log_nwords := math.Log(float_nwords) // log(nwords) used in entropy calculation
|
||||
entropyMax := emax(kmer_size, wordSize) // Theoretical maximum entropy (uses CanonicalKmerCount internally)
|
||||
log_nwords := logNwords[wordSize]
|
||||
entropyMax := emaxValues[wordSize]
|
||||
|
||||
// Reset frequency table (must clear entire table, not just nalpha entries)
|
||||
cleanTable(table, tableSize)
|
||||
|
||||
for i := 1; i < lseq; i++ {
|
||||
entropies[i] = 6
|
||||
}
|
||||
end := lseq - wordSize + 1 // Last position where a word can start
|
||||
end := lseq - wordSize + 1
|
||||
|
||||
// ========================================================================
|
||||
// STEP 1: Encode all words in the sequence
|
||||
// ========================================================================
|
||||
// Uses left-shift encoding: each nucleotide is encoded on 2 bits
|
||||
// a=00, c=01, g=10, t=11
|
||||
mask := (1 << (wordSize * 2)) - 1
|
||||
|
||||
mask := (1 << (wordSize * 2)) - 1 // Mask to keep only last wordSize*2 bits
|
||||
|
||||
// Initialize first word (all nucleotides except the last one)
|
||||
word_index := 0
|
||||
for i := 0; i < wordSize-1; i++ {
|
||||
word_index = (word_index << 2) + int(obikmer.EncodeNucleotide(sequence[i]))
|
||||
}
|
||||
|
||||
// Encode all words with sliding window
|
||||
for i, j := 0, wordSize-1; i < end; i, j = i+1, j+1 {
|
||||
// Shift left by 2 bits, mask, and add new nucleotide
|
||||
word_index = ((word_index << 2) & mask) + int(obikmer.EncodeNucleotide(sequence[j]))
|
||||
|
||||
// *** CIRCULAR NORMALIZATION ***
|
||||
// Convert word to its canonical form (smallest by circular rotation)
|
||||
// This is where "atg", "tga", "gat" all become "atg"
|
||||
// Now using uint64-based NormalizeCircular for better performance
|
||||
words[i] = int(obikmer.NormalizeCircular(uint64(word_index), wordSize))
|
||||
words[i] = normTable[word_index]
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// STEP 2: Calculate entropy for each k-mer with sliding window
|
||||
// ========================================================================
|
||||
s := 0 // Number of words processed in current k-mer
|
||||
sum_n_logn := 0.0 // Sum of n*log(n) for entropy calculation
|
||||
entropy := 1.0 // Current normalized entropy
|
||||
cleaned := true // Flag indicating if table has been cleaned
|
||||
s := 0
|
||||
sum_n_logn := 0.0
|
||||
entropy := 1.0
|
||||
cleaned := true
|
||||
|
||||
for i := range end {
|
||||
s++
|
||||
|
||||
switch {
|
||||
// CASE 1: Filling phase (fewer than nwords words collected)
|
||||
case s < nwords:
|
||||
cleaned = false
|
||||
table[words[i]]++ // Increment word frequency
|
||||
table[words[i]]++
|
||||
|
||||
// CASE 2: Position contains an ambiguity
|
||||
case i >= (nwords-1) && maskPositions[i-nwords+1] < 0:
|
||||
entropies[i-nwords+1] = 4.0 // Mark entropy as invalid
|
||||
entropies[i-nwords+1] = 4.0
|
||||
if !cleaned {
|
||||
cleanTable(table, tableSize) // Reset table
|
||||
cleanTable(table, tableSize)
|
||||
}
|
||||
cleaned = true
|
||||
s = 0
|
||||
sum_n_logn = 0.0
|
||||
|
||||
// CASE 3: First complete k-mer (s == nwords)
|
||||
case s == nwords:
|
||||
cleaned = false
|
||||
table[words[i]]++
|
||||
|
||||
// Calculate Shannon entropy: H = -Σ p(i)*log(p(i))
|
||||
// = log(N) - (1/N)*Σ n(i)*log(n(i))
|
||||
// where N = nwords, n(i) = frequency of word i
|
||||
//
|
||||
// NOTE: We iterate over entire table (tableSize = 4^wordSize) to count all frequencies.
|
||||
// Canonical codes are not contiguous (e.g., for k=2: {0,1,2,3,5,6,7,10,11,15})
|
||||
// so we must scan the full table even though only ~10 entries will be non-zero
|
||||
sum_n_logn = 0
|
||||
for j := range tableSize {
|
||||
n := float64(table[j])
|
||||
if n > 0 {
|
||||
sum_n_logn += n * math.Log(n)
|
||||
sum_n_logn += nLogN[int(n)]
|
||||
}
|
||||
}
|
||||
// Normalized entropy = observed entropy / maximum entropy
|
||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||
|
||||
// CASE 4: Sliding window (s > nwords)
|
||||
// Incremental update of entropy by adding a new word
|
||||
// and removing the old one
|
||||
case s > nwords:
|
||||
cleaned = false
|
||||
|
||||
new_word := words[i]
|
||||
old_word := words[i-nwords]
|
||||
|
||||
// Optimization: only recalculate if word changes
|
||||
if old_word != new_word {
|
||||
table[new_word]++
|
||||
table[old_word]--
|
||||
@@ -285,59 +207,39 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
n_old := float64(table[old_word])
|
||||
n_new := float64(table[new_word])
|
||||
|
||||
// Incremental update of sum_n_logn
|
||||
// Remove contribution of old word (before decrement)
|
||||
sum_n_logn -= (n_old + 1) * math.Log(n_old+1)
|
||||
// Add contribution of old word (after decrement)
|
||||
sum_n_logn -= nLogN[int(n_old+1)]
|
||||
if n_old > 0 {
|
||||
sum_n_logn += n_old * math.Log(n_old)
|
||||
sum_n_logn += nLogN[int(n_old)]
|
||||
}
|
||||
// Add contribution of new word (after increment)
|
||||
if n_new > 0 {
|
||||
sum_n_logn += n_new * math.Log(n_new)
|
||||
sum_n_logn += nLogN[int(n_new)]
|
||||
}
|
||||
// Remove contribution of new word (before increment)
|
||||
if n_new > 1 {
|
||||
sum_n_logn -= (n_new - 1) * math.Log(n_new-1)
|
||||
sum_n_logn -= nLogN[int(n_new-1)]
|
||||
}
|
||||
}
|
||||
|
||||
entropy = (log_nwords - sum_n_logn/float_nwords) / entropyMax
|
||||
}
|
||||
|
||||
// Store entropy for position corresponding to start of k-mer
|
||||
if s >= nwords && maskPositions[i-nwords+1] >= 0 {
|
||||
if entropy < 0 {
|
||||
entropy = 0
|
||||
|
||||
}
|
||||
entropy = math.Round(entropy*10000) / 10000
|
||||
entropies[i-nwords+1] = entropy
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// STEP 3: Apply sliding min filter
|
||||
// ========================================================================
|
||||
// Replace each entropy with minimum in window of size kmer_size
|
||||
// This allows robust detection of low-complexity regions
|
||||
slidingMin(entropies, kmer_size)
|
||||
// log.Warnf("%v\n%v", e, entropies)
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 6: applyMaskMode - Apply masking to sequence
|
||||
// ========================================================================
|
||||
applyMaskMode := func(sequence *obiseq.BioSequence, maskPositions []bool, mask byte) (obiseq.BioSequenceSlice, error) {
|
||||
// Create copy to avoid modifying original
|
||||
seqCopy := sequence.Copy()
|
||||
sequenceBytes := seqCopy.Sequence()
|
||||
|
||||
// Mask identified positions
|
||||
for i := range sequenceBytes {
|
||||
if maskPositions[i] {
|
||||
// Operation &^ 32 converts to UPPERCASE (clears bit 5)
|
||||
// sequenceBytes[i] = sequenceBytes[i] &^ 32
|
||||
sequenceBytes[i] = mask
|
||||
}
|
||||
}
|
||||
@@ -368,7 +270,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the case where we end in a masked region
|
||||
if inlow && fromlow >= 0 {
|
||||
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
|
||||
if err != nil {
|
||||
@@ -403,7 +304,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the case where we end in an unmasked region
|
||||
if inhigh && fromhigh >= 0 {
|
||||
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
|
||||
if err != nil {
|
||||
@@ -415,11 +315,6 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
return *rep, nil
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// FUNCTION 7: masking - Main masking function
|
||||
// ========================================================================
|
||||
// Calculates entropies at all scales and masks positions
|
||||
// whose minimum entropy is below the threshold.
|
||||
masking := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
if sequence.Len() < kmer_size {
|
||||
sequence.SetAttribute("obilowmask_error", "Sequence too short")
|
||||
@@ -432,45 +327,27 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
|
||||
bseq := sequence.Sequence()
|
||||
|
||||
// Identify ambiguities
|
||||
maskPositions := maskAmbiguities(bseq)
|
||||
|
||||
// Initialize data structures
|
||||
mask := make([]int, len(bseq)) // Stores scale detecting minimum entropy
|
||||
entropies := make([]float64, len(bseq)) // Minimum entropy at each position
|
||||
mask := make([]int, len(bseq))
|
||||
entropies := make([]float64, len(bseq))
|
||||
for i := range entropies {
|
||||
entropies[i] = 4.0 // Very high initial value
|
||||
entropies[i] = 4.0
|
||||
}
|
||||
|
||||
freqs := make([]int, 1<<(2*level_max)) // Frequency table (max size)
|
||||
words := make([]int, len(bseq)) // Buffer for encoded words
|
||||
freqs := make([]int, 1<<(2*level_max))
|
||||
words := make([]int, len(bseq))
|
||||
entropies2 := make([]float64, len(bseq))
|
||||
|
||||
// ========================================================================
|
||||
// Calculate entropy at maximum scale (level_max)
|
||||
// ========================================================================
|
||||
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max)
|
||||
computeEntropies(bseq, maskPositions, entropies, freqs, words, level_max, normTables[level_max])
|
||||
|
||||
// Initialize mask with level_max everywhere (except ambiguities)
|
||||
for i := range bseq {
|
||||
v := level_max
|
||||
// if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
// v = 0
|
||||
// }
|
||||
mask[i] = v
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// Calculate entropy at lower scales
|
||||
// ========================================================================
|
||||
entropies2 := make([]float64, len(bseq))
|
||||
|
||||
for ws := level_max - 1; ws > 0; ws-- {
|
||||
// *** WARNING: POTENTIAL BUG ***
|
||||
// The parameter passed is level_max instead of ws!
|
||||
// This means we always recalculate with the same scale
|
||||
// Should be: computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
|
||||
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws)
|
||||
// Keep minimum entropy and corresponding scale
|
||||
computeEntropies(bseq, maskPositions, entropies2, freqs, words, ws, normTables[ws])
|
||||
for i, e2 := range entropies2 {
|
||||
if e2 < entropies[i] {
|
||||
entropies[i] = e2
|
||||
@@ -479,22 +356,17 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
||||
}
|
||||
}
|
||||
|
||||
// Force entropy to 0 for ambiguous positions
|
||||
for i, nuc := range bseq {
|
||||
if nuc != 'a' && nuc != 'c' && nuc != 'g' && nuc != 't' {
|
||||
entropies[i] = 0
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// Identify positions to mask
|
||||
// ========================================================================
|
||||
remove := make([]bool, len(entropies))
|
||||
for i, e := range entropies {
|
||||
remove[i] = e <= threshold
|
||||
}
|
||||
|
||||
// Save metadata in sequence attributes
|
||||
sequence.SetAttribute("mask", mask)
|
||||
sequence.SetAttribute("Entropies", entropies)
|
||||
|
||||
@@ -527,9 +399,7 @@ func CLISequenceEntropyMasker(iterator obiiter.IBioSequence) obiiter.IBioSequenc
|
||||
CLIMaskingChar(),
|
||||
)
|
||||
|
||||
// Apply worker in parallel
|
||||
newIter = iterator.MakeIWorker(worker, false, obidefault.ParallelWorkers())
|
||||
|
||||
// Filter resulting empty sequences
|
||||
return newIter.FilterEmpty()
|
||||
}
|
||||
|
||||
40
pkg/obitools/obilowmask/obilowmask_test.go
Normal file
40
pkg/obitools/obilowmask/obilowmask_test.go
Normal file
@@ -0,0 +1,40 @@
|
||||
package obilowmask
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
func TestLowMaskWorker(t *testing.T) {
|
||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("acgtacgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||
result, err := worker(seq)
|
||||
if err != nil {
|
||||
t.Fatalf("Worker failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Len() != 1 {
|
||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||
}
|
||||
|
||||
resultSeq := result[0]
|
||||
if resultSeq.Len() != 32 {
|
||||
t.Fatalf("Expected sequence length 32, got %d", resultSeq.Len())
|
||||
}
|
||||
}
|
||||
|
||||
func TestLowMaskWorkerWithAmbiguity(t *testing.T) {
|
||||
worker := LowMaskWorker(31, 6, 0.3, Mask, 'n')
|
||||
|
||||
seq := obiseq.NewBioSequence("test", []byte("acgtNcgtacgtacgtacgtacgtacgtacgt"), "test")
|
||||
result, err := worker(seq)
|
||||
if err != nil {
|
||||
t.Fatalf("Worker failed: %v", err)
|
||||
}
|
||||
|
||||
if result.Len() != 1 {
|
||||
t.Fatalf("Expected 1 sequence, got %d", result.Len())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user