From e681666aaad78d323bf95976402512d24e5963d8 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 24 Nov 2025 14:28:21 +0100 Subject: [PATCH] End of obilowmask --- pkg/obioptions/version.go | 2 +- pkg/obitools/obilowmask/obilowmask.go | 82 ++++++++++++++++++++++++++- pkg/obitools/obilowmask/options.go | 17 ++++-- 3 files changed, 95 insertions(+), 6 deletions(-) diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index cfe9ddd..235595b 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "961abce" +var _Commit = "c1b9503" var _Version = "Release 4.4.0" // Version returns the version of the obitools package. diff --git a/pkg/obitools/obilowmask/obilowmask.go b/pkg/obitools/obilowmask/obilowmask.go index a75bca1..5a3bf63 100644 --- a/pkg/obitools/obilowmask/obilowmask.go +++ b/pkg/obitools/obilowmask/obilowmask.go @@ -1,6 +1,7 @@ package obilowmask import ( + "fmt" "math" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" @@ -17,6 +18,7 @@ type MaskingMode int const ( Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters Split // Split mode: split sequence into high-complexity fragments + Extract ) // LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. 
@@ -342,6 +344,76 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking return obiseq.BioSequenceSlice{seqCopy}, nil } + selectMasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) { + rep := obiseq.NewBioSequenceSlice() + + inlow := false + fromlow := -1 + for i, masked := range maskPosition { + if masked && !inlow { + fromlow = i + inlow = true + } + if inlow && !masked { + if fromlow >= 0 { + frg, err := sequence.Subsequence(fromlow, i, false) + if err != nil { + return nil, err + } + rep.Push(frg) + } + inlow = false + fromlow = -1 + } + } + + // Handle the case where we end in a masked region + if inlow && fromlow >= 0 { + frg, err := sequence.Subsequence(fromlow, len(maskPosition), false) + if err != nil { + return nil, err + } + rep.Push(frg) + } + + return *rep, nil + } + + selectunmasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) { + rep := obiseq.NewBioSequenceSlice() + + inhigh := false + fromhigh := -1 + for i, masked := range maskPosition { + if !masked && !inhigh { + fromhigh = i + inhigh = true + } + if inhigh && masked { + if fromhigh >= 0 { + frg, err := sequence.Subsequence(fromhigh, i, false) + if err != nil { + return nil, err + } + rep.Push(frg) + } + inhigh = false + fromhigh = -1 + } + } + + // Handle the case where we end in an unmasked region + if inhigh && fromhigh >= 0 { + frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false) + if err != nil { + return nil, err + } + rep.Push(frg) + } + + return *rep, nil + } + // ======================================================================== // FUNCTION 7: masking - Main masking function // ======================================================================== @@ -425,7 +497,15 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking sequence.SetAttribute("mask", mask) sequence.SetAttribute("Entropies", entropies) - return 
applyMaskMode(sequence, remove, maskChar) + switch mode { + case Mask: + return applyMaskMode(sequence, remove, maskChar) + case Split: + return selectunmasked(sequence, remove) + case Extract: + return selectMasked(sequence, remove) + } + return nil, fmt.Errorf("Unknown mode %d", mode) } return masking diff --git a/pkg/obitools/obilowmask/options.go b/pkg/obitools/obilowmask/options.go index c9b2f50..30c1408 100644 --- a/pkg/obitools/obilowmask/options.go +++ b/pkg/obitools/obilowmask/options.go @@ -13,6 +13,7 @@ var __kmer_size__ = 31 var __level_max__ = 6 var __threshold__ = 0.5 var __split_mode__ = false +var __low_mode__ = false var __mask__ = "." func LowMaskOptionSet(options *getoptions.GetOpt) { @@ -29,11 +30,15 @@ func LowMaskOptionSet(options *getoptions.GetOpt) { options.Description("entropy theshold used to mask a kmer"), ) - options.BoolVar(&__split_mode__, "--split-mode", __split_mode__, + options.BoolVar(&__split_mode__, "split-mode", __split_mode__, options.Description("in split mode, input sequences are splitted to remove masked regions"), ) - options.StringVar(&__mask__, "--masking-char", __mask__, + options.BoolVar(&__low_mode__, "low-mode", __low_mode__, + options.Description("in low mode, only the masked low-complexity regions of input sequences are extracted"), + ) + + options.StringVar(&__mask__, "masking-char", __mask__, options.Description("Character used to mask low complexity region"), ) } @@ -41,6 +46,7 @@ func LowMaskOptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) { LowMaskOptionSet(options) obiconvert.InputOptionSet(options) + obiconvert.OutputOptionSet(options) } func CLIKmerSize() int { @@ -56,9 +62,12 @@ func CLIThreshold() float64 { } func CLIMaskingMode() MaskingMode { - if __split_mode__ { + switch { + case __low_mode__: + return Extract + case __split_mode__: return Split - } else { + default: return Mask } }