Merge pull request #64 from metabarcoding/push-yurwulsmpxkq

End of obilowmask
This commit is contained in:
coissac
2025-11-24 15:36:20 +01:00
committed by GitHub
3 changed files with 94 additions and 6 deletions

View File

@@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be
// committed
var _Commit = "961abce"
var _Commit = "c1b9503"
var _Version = "Release 4.4.0"
// Version returns the version of the obitools package.

View File

@@ -1,6 +1,7 @@
package obilowmask
import (
"fmt"
"math"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
@@ -17,6 +18,7 @@ type MaskingMode int
// Masking modes selecting what the worker does with the detected low-complexity positions.
const (
Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters
Split // Split mode: split sequence into high-complexity fragments
Extract // Extract mode: return only the masked (low-complexity) regions as fragments
)
// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
@@ -342,6 +344,76 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
return obiseq.BioSequenceSlice{seqCopy}, nil
}
// selectMasked collects every maximal run of masked positions (true entries
// of maskPosition) and returns one subsequence of sequence per run, in order
// of appearance.
//
// Each run is cut with sequence.Subsequence, given the index of its first
// masked position and the index just past its last one. The first error
// reported by Subsequence aborts the scan and is returned with a nil slice.
selectMasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
	fragments := obiseq.NewBioSequenceSlice()

	// runStart is the index where the currently open masked run began,
	// or -1 when no run is open.
	runStart := -1

	for pos, isMasked := range maskPosition {
		switch {
		case isMasked && runStart < 0:
			// Entering a masked run.
			runStart = pos

		case !isMasked && runStart >= 0:
			// Leaving a masked run: emit it and close it.
			frg, err := sequence.Subsequence(runStart, pos, false)
			if err != nil {
				return nil, err
			}
			fragments.Push(frg)
			runStart = -1
		}
	}

	// A run still open after the scan extends to the end of the mask.
	if runStart >= 0 {
		frg, err := sequence.Subsequence(runStart, len(maskPosition), false)
		if err != nil {
			return nil, err
		}
		fragments.Push(frg)
	}

	return *fragments, nil
}
// selectunmasked collects every maximal run of unmasked positions (false
// entries of maskPosition) and returns one subsequence of sequence per run,
// in order of appearance.
//
// Each run is cut with sequence.Subsequence, given the index of its first
// unmasked position and the index just past its last one. The first error
// reported by Subsequence aborts the scan and is returned with a nil slice.
selectunmasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
	kept := obiseq.NewBioSequenceSlice()

	// begin is the index where the currently open unmasked run started,
	// or -1 when no run is open.
	begin := -1

	for pos, isMasked := range maskPosition {
		switch {
		case !isMasked && begin < 0:
			// Entering an unmasked run.
			begin = pos

		case isMasked && begin >= 0:
			// Hitting a masked position: emit the run and close it.
			frg, err := sequence.Subsequence(begin, pos, false)
			if err != nil {
				return nil, err
			}
			kept.Push(frg)
			begin = -1
		}
	}

	// A run still open after the scan extends to the end of the mask.
	if begin >= 0 {
		frg, err := sequence.Subsequence(begin, len(maskPosition), false)
		if err != nil {
			return nil, err
		}
		kept.Push(frg)
	}

	return *kept, nil
}
// ========================================================================
// FUNCTION 7: masking - Main masking function
// ========================================================================
@@ -425,7 +497,15 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
sequence.SetAttribute("mask", mask)
sequence.SetAttribute("Entropies", entropies)
return applyMaskMode(sequence, remove, maskChar)
switch mode {
case Mask:
return applyMaskMode(sequence, remove, maskChar)
case Split:
return selectunmasked(sequence, remove)
case Extract:
return selectMasked(sequence, remove)
}
return nil, fmt.Errorf("Unknown mode %d", mode)
}
return masking

View File

@@ -13,6 +13,7 @@ var __kmer_size__ = 31
var __level_max__ = 6
var __threshold__ = 0.5
var __split_mode__ = false
var __low_mode__ = false
var __mask__ = "."
func LowMaskOptionSet(options *getoptions.GetOpt) {
@@ -29,11 +30,15 @@ func LowMaskOptionSet(options *getoptions.GetOpt) {
options.Description("entropy theshold used to mask a kmer"),
)
options.BoolVar(&__split_mode__, "--split-mode", __split_mode__,
options.BoolVar(&__split_mode__, "split-mode", __split_mode__,
options.Description("in split mode, input sequences are splitted to remove masked regions"),
)
options.StringVar(&__mask__, "--masking-char", __mask__,
// Register the low-mode flag. CLIMaskingMode maps it to the Extract mode,
// whose output is the masked regions, so the help text must not repeat the
// split-mode description.
options.BoolVar(&__low_mode__, "low-mode", __low_mode__,
	options.Description("in low mode, only the masked low-complexity regions of input sequences are extracted"),
)
options.StringVar(&__mask__, "masking-char", __mask__,
options.Description("Character used to mask low complexity region"),
)
}
@@ -56,9 +61,12 @@ func CLIThreshold() float64 {
}
// CLIMaskingMode returns the masking mode selected by the command-line flags.
//
// In the switch form, __low_mode__ is tested first and yields Extract, then
// __split_mode__ yields Split, and Mask is the default when neither is set.
//
// NOTE(review): this span interleaves pre-change (if/else) and post-change
// (switch) lines of the rendered diff; the description follows the switch
// form — confirm against the merged file.
func CLIMaskingMode() MaskingMode {
if __split_mode__ {
switch {
case __low_mode__:
return Extract
case __split_mode__:
return Split
} else {
default:
return Mask
}
}