Add option to keep shorter fragments in lowmask

Add a new boolean option 'keep-shorter' to preserve fragments shorter than kmer-size during split/extract mode.

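This change introduces a new flag _lowmaskKeepShorter (default: false) that controls whether fragments
shorter than the kmer size are kept during split/extract operations. Previously every fragment was
emitted regardless of its length; with this change such short fragments are dropped unless the flag is set.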

The implementation:
1. Adds the new boolean variable _lowmaskKeepShorter
2. Registers the command-line option "keep-shorter"
3. Updates the lowMaskWorker function signature to accept the keepShorter parameter
4. Modifies the fragment selection logic so that a fragment is emitted only when keepShorter is set or its length is at least kmer_size
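5. Updates the worker creation in runLowmask to pass the global flag value
6. In the branch that marks the whole sequence for removal, dispatches on the masking mode (mask, split or extract) instead of always applying mask mode, and returns an error for an unknown mode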

This allows users to control the behavior when dealing with short sequences in
split/extract modes, providing more flexibility in low-complexity masking.
This commit is contained in:
Eric Coissac
2026-02-10 09:36:32 +01:00
parent f2937af1ad
commit e775f7e256

View File

@@ -32,6 +32,7 @@ var _lowmaskThreshold = 0.5
var _lowmaskSplitMode = false var _lowmaskSplitMode = false
var _lowmaskLowMode = false var _lowmaskLowMode = false
var _lowmaskMaskChar = "." var _lowmaskMaskChar = "."
var _lowmaskKeepShorter = false
// LowMaskOptionSet registers options specific to low-complexity masking. // LowMaskOptionSet registers options specific to low-complexity masking.
func LowMaskOptionSet(options *getoptions.GetOpt) { func LowMaskOptionSet(options *getoptions.GetOpt) {
@@ -52,6 +53,9 @@ func LowMaskOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar, options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar,
options.Description("Character used to mask low complexity regions.")) options.Description("Character used to mask low complexity regions."))
options.BoolVar(&_lowmaskKeepShorter, "keep-shorter", _lowmaskKeepShorter,
options.Description("Keep fragments shorter than kmer-size in split/extract mode."))
} }
func lowmaskMaskingMode() MaskingMode { func lowmaskMaskingMode() MaskingMode {
@@ -74,7 +78,7 @@ func lowmaskMaskingChar() byte {
} }
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. // lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker { func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker {
nLogN := make([]float64, kmer_size+1) nLogN := make([]float64, kmer_size+1)
for i := 1; i <= kmer_size; i++ { for i := 1; i <= kmer_size; i++ {
@@ -301,11 +305,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
} }
if inlow && !masked { if inlow && !masked {
if fromlow >= 0 { if fromlow >= 0 {
frg, err := sequence.Subsequence(fromlow, i, false) frgLen := i - fromlow
if err != nil { if keepShorter || frgLen >= kmer_size {
return nil, err frg, err := sequence.Subsequence(fromlow, i, false)
if err != nil {
return nil, err
}
rep.Push(frg)
} }
rep.Push(frg)
} }
inlow = false inlow = false
fromlow = -1 fromlow = -1
@@ -313,11 +320,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
} }
if inlow && fromlow >= 0 { if inlow && fromlow >= 0 {
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false) frgLen := len(maskPosition) - fromlow
if err != nil { if keepShorter || frgLen >= kmer_size {
return nil, err frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
if err != nil {
return nil, err
}
rep.Push(frg)
} }
rep.Push(frg)
} }
return *rep, nil return *rep, nil
@@ -335,11 +345,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
} }
if inhigh && masked { if inhigh && masked {
if fromhigh >= 0 { if fromhigh >= 0 {
frg, err := sequence.Subsequence(fromhigh, i, false) frgLen := i - fromhigh
if err != nil { if keepShorter || frgLen >= kmer_size {
return nil, err frg, err := sequence.Subsequence(fromhigh, i, false)
if err != nil {
return nil, err
}
rep.Push(frg)
} }
rep.Push(frg)
} }
inhigh = false inhigh = false
fromhigh = -1 fromhigh = -1
@@ -347,11 +360,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
} }
if inhigh && fromhigh >= 0 { if inhigh && fromhigh >= 0 {
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false) frgLen := len(maskPosition) - fromhigh
if err != nil { if keepShorter || frgLen >= kmer_size {
return nil, err frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
if err != nil {
return nil, err
}
rep.Push(frg)
} }
rep.Push(frg)
} }
return *rep, nil return *rep, nil
@@ -364,7 +380,15 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
for i := range remove { for i := range remove {
remove[i] = true remove[i] = true
} }
return applyMaskMode(sequence, remove, maskChar) switch mode {
case MaskMode:
return applyMaskMode(sequence, remove, maskChar)
case SplitMode:
return selectunmasked(sequence, remove)
case ExtractMode:
return selectMasked(sequence, remove)
}
return nil, fmt.Errorf("unknown mode %d", mode)
} }
bseq := sequence.Sequence() bseq := sequence.Sequence()
@@ -442,7 +466,7 @@ func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) erro
return fmt.Errorf("failed to open sequence files: %w", err) return fmt.Errorf("failed to open sequence files: %w", err)
} }
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar) worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, _lowmaskKeepShorter)
masked := sequences.MakeIWorker( masked := sequences.MakeIWorker(
worker, worker,