Add option to keep shorter fragments in lowmask

Add a new boolean option 'keep-shorter' to preserve fragments shorter than kmer-size during split/extract mode.

This change introduces a new flag, _lowmaskKeepShorter (default: false), that controls
whether fragments shorter than the kmer size are kept during split/extract operations.

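The implementation:
1. Adds the new boolean variable _lowmaskKeepShorter
2. Registers the command-line option "keep-shorter"
3. Updates the lowMaskWorker function signature to accept the keepShorter parameter
4. Modifies the fragment selection logic so that a fragment is emitted only when
   keepShorter is set or its length is at least kmer_size
5. Updates the worker creation to pass the global flag value
6. Makes the code path that marks the whole sequence for removal dispatch on the
   masking mode (mask, split or extract) instead of always applying mask mode, and
   return an error for an unknown mode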

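This allows users to control how short fragments are handled in split/extract
modes, providing more flexibility in low-complexity masking.
Note that this changes the default behavior: fragments shorter than kmer-size were
previously always emitted, and are now dropped unless the keep-shorter option is set.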
This commit is contained in:
Eric Coissac
2026-02-10 09:36:32 +01:00
parent f2937af1ad
commit e775f7e256

View File

@@ -32,6 +32,7 @@ var _lowmaskThreshold = 0.5
var _lowmaskSplitMode = false
var _lowmaskLowMode = false
var _lowmaskMaskChar = "."
var _lowmaskKeepShorter = false
// LowMaskOptionSet registers options specific to low-complexity masking.
func LowMaskOptionSet(options *getoptions.GetOpt) {
@@ -52,6 +53,9 @@ func LowMaskOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar,
options.Description("Character used to mask low complexity regions."))
options.BoolVar(&_lowmaskKeepShorter, "keep-shorter", _lowmaskKeepShorter,
options.Description("Keep fragments shorter than kmer-size in split/extract mode."))
}
func lowmaskMaskingMode() MaskingMode {
@@ -74,7 +78,7 @@ func lowmaskMaskingChar() byte {
}
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte) obiseq.SeqWorker {
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker {
nLogN := make([]float64, kmer_size+1)
for i := 1; i <= kmer_size; i++ {
@@ -301,11 +305,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
}
if inlow && !masked {
if fromlow >= 0 {
frg, err := sequence.Subsequence(fromlow, i, false)
if err != nil {
return nil, err
frgLen := i - fromlow
if keepShorter || frgLen >= kmer_size {
frg, err := sequence.Subsequence(fromlow, i, false)
if err != nil {
return nil, err
}
rep.Push(frg)
}
rep.Push(frg)
}
inlow = false
fromlow = -1
@@ -313,11 +320,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
}
if inlow && fromlow >= 0 {
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
if err != nil {
return nil, err
frgLen := len(maskPosition) - fromlow
if keepShorter || frgLen >= kmer_size {
frg, err := sequence.Subsequence(fromlow, len(maskPosition), false)
if err != nil {
return nil, err
}
rep.Push(frg)
}
rep.Push(frg)
}
return *rep, nil
@@ -335,11 +345,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
}
if inhigh && masked {
if fromhigh >= 0 {
frg, err := sequence.Subsequence(fromhigh, i, false)
if err != nil {
return nil, err
frgLen := i - fromhigh
if keepShorter || frgLen >= kmer_size {
frg, err := sequence.Subsequence(fromhigh, i, false)
if err != nil {
return nil, err
}
rep.Push(frg)
}
rep.Push(frg)
}
inhigh = false
fromhigh = -1
@@ -347,11 +360,14 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
}
if inhigh && fromhigh >= 0 {
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
if err != nil {
return nil, err
frgLen := len(maskPosition) - fromhigh
if keepShorter || frgLen >= kmer_size {
frg, err := sequence.Subsequence(fromhigh, len(maskPosition), false)
if err != nil {
return nil, err
}
rep.Push(frg)
}
rep.Push(frg)
}
return *rep, nil
@@ -364,7 +380,15 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
for i := range remove {
remove[i] = true
}
return applyMaskMode(sequence, remove, maskChar)
switch mode {
case MaskMode:
return applyMaskMode(sequence, remove, maskChar)
case SplitMode:
return selectunmasked(sequence, remove)
case ExtractMode:
return selectMasked(sequence, remove)
}
return nil, fmt.Errorf("unknown mode %d", mode)
}
bseq := sequence.Sequence()
@@ -442,7 +466,7 @@ func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) erro
return fmt.Errorf("failed to open sequence files: %w", err)
}
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar)
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, _lowmaskKeepShorter)
masked := sequences.MakeIWorker(
worker,