Refactor lowmask options and shared kmer options

Refactor lowmask options to use shared kmer options and CLI getters

This commit refactors the lowmask subcommand to use shared kmer options and CLI getters instead of local variables. It also moves the kmer size and minimizer size options to a shared location and adds new CLI getters for the lowmask options.

- Move kmer size and minimizer size options to shared location
- Add CLI getters for lowmask options
- Refactor lowmask to use CLI getters
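- Remove the strings import from the lowmask source, unused once the masking-char helper moved out
- Move the MaskingMode type and its related functions to the shared options file
- Rename the --split-mode and --low-mode flags to --extract-high and --extract-low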
This commit is contained in:
Eric Coissac
2026-02-10 09:52:28 +01:00
parent e775f7e256
commit 9babcc0fae
3 changed files with 121 additions and 93 deletions

View File

@@ -4,7 +4,6 @@ import (
"context"
"fmt"
"math"
"strings"
log "github.com/sirupsen/logrus"
@@ -16,67 +15,6 @@ import (
"github.com/DavidGamba/go-getoptions"
)
// MaskingMode selects what is done with the low-complexity regions
// found in a sequence.
type MaskingMode int

// Masking modes, numbered from zero in declaration order.
const (
	// MaskMode replaces low-complexity regions with masked characters.
	MaskMode MaskingMode = iota
	// SplitMode splits the sequence into high-complexity fragments.
	SplitMode
	// ExtractMode extracts the low-complexity fragments.
	ExtractMode
)
// Option variables private to the lowmask subcommand; the k-mer size kept
// here is distinct from the one used by index/super. The initial values
// below are the defaults.
var (
	_lowmaskKmerSize    = 31
	_lowmaskLevelMax    = 6
	_lowmaskThreshold   = 0.5
	_lowmaskSplitMode   = false
	_lowmaskLowMode     = false
	_lowmaskMaskChar    = "."
	_lowmaskKeepShorter = false
)
// LowMaskOptionSet declares, on the given option parser, the flags that
// control low-complexity masking. Each flag is bound to its package-level
// variable, whose current value is passed as the default.
func LowMaskOptionSet(options *getoptions.GetOpt) {
	// Entropy estimation parameters.
	options.IntVar(
		&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize,
		options.Description("Size of the kmer considered to estimate entropy."),
	)
	options.IntVar(
		&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax,
		options.Description("Maximum word size considered for entropy estimate."),
	)
	options.Float64Var(
		&_lowmaskThreshold, "threshold", _lowmaskThreshold,
		options.Description("Entropy threshold below which a kmer is masked (0 to 1)."),
	)

	// Output mode selection.
	options.BoolVar(
		&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode,
		options.Description("Split sequences to remove masked regions."),
	)
	options.BoolVar(
		&_lowmaskLowMode, "low-mode", _lowmaskLowMode,
		options.Description("Extract only low-complexity regions."),
	)

	// Output details.
	options.StringVar(
		&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar,
		options.Description("Character used to mask low complexity regions."),
	)
	options.BoolVar(
		&_lowmaskKeepShorter, "keep-shorter", _lowmaskKeepShorter,
		options.Description("Keep fragments shorter than kmer-size in split/extract mode."),
	)
}
// lowmaskMaskingMode derives the masking mode from the lowmask flags.
// The low-mode flag is tested first, so it takes precedence when both flags
// are set; with neither flag set the result is MaskMode.
func lowmaskMaskingMode() MaskingMode {
	if _lowmaskLowMode {
		return ExtractMode
	}
	if _lowmaskSplitMode {
		return SplitMode
	}
	return MaskMode
}
// lowmaskMaskingChar returns the configured masking character as a byte.
// Surrounding whitespace is trimmed first; a trimmed value that is not
// exactly one byte long is reported through log.Fatalf.
func lowmaskMaskingChar() byte {
	trimmed := strings.TrimSpace(_lowmaskMaskChar)
	if n := len(trimmed); n != 1 {
		log.Fatalf("--masking-char option accepts a single character, not %s", trimmed)
	}
	return trimmed[0]
}
// lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker {
@@ -453,11 +391,11 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
// runLowmask implements the "obik lowmask" subcommand.
// It masks low-complexity regions in DNA sequences using entropy-based detection.
func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
kmerSize := _lowmaskKmerSize
levelMax := _lowmaskLevelMax
threshold := _lowmaskThreshold
mode := lowmaskMaskingMode()
maskChar := lowmaskMaskingChar()
kmerSize := CLIKmerSize()
levelMax := CLIEntropySize()
threshold := CLIEntropyThreshold()
mode := CLIMaskingMode()
maskChar := CLIMaskingChar()
log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold)
@@ -466,7 +404,7 @@ func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) erro
return fmt.Errorf("failed to open sequence files: %w", err)
}
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, _lowmaskKeepShorter)
worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, CLIKeepShorter())
masked := sequences.MakeIWorker(
worker,

View File

@@ -11,6 +11,15 @@ import (
"github.com/DavidGamba/go-getoptions"
)
// MaskingMode selects what is done with the low-complexity regions
// found in a sequence.
type MaskingMode int

// Masking modes, numbered from zero in declaration order.
const (
	// MaskMode replaces low-complexity regions with masked characters.
	MaskMode MaskingMode = iota
	// SplitMode splits the sequence into high-complexity fragments.
	SplitMode
	// ExtractMode extracts the low-complexity fragments.
	ExtractMode
)
// Output format flags.
// Both are declared without an initializer, so they start out false.
// NOTE(review): presumably each one is bound to its matching --*-output
// switch by OutputFormatOptionSet — confirm against that function.
var _jsonOutput bool
var _csvOutput bool
@@ -29,11 +38,66 @@ var _jaccard bool
var _setMetaTags = make(map[string]string, 0)
// ==============================
// Kmer index building options (moved from obikindex)
// Shared kmer options (used by index, super, lowmask)
// ==============================
// Backing variables for the shared --kmer-size and --minimizer-size
// options; the initial values are the defaults.
var (
	_kmerSize      = 31
	_minimizerSize = -1 // -1 means auto: ceil(k / 2.5)
)
// KmerSizeOptionSet declares the --kmer-size option, with alias k, on the
// given option parser. The value is stored in _kmerSize, whose current value
// is used as the default. The option is shared by the index, super and
// lowmask subcommands.
func KmerSizeOptionSet(options *getoptions.GetOpt) {
	defaultSize := _kmerSize
	options.IntVar(
		&_kmerSize,
		"kmer-size",
		defaultSize,
		options.Alias("k"),
		options.Description("Size of k-mers (must be between 2 and 31)."),
	)
}
// MinimizerOptionSet declares the --minimizer-size option, with alias m, on
// the given option parser. The value is stored in _minimizerSize, whose
// current value is used as the default. The option is shared by the index
// and super subcommands.
func MinimizerOptionSet(options *getoptions.GetOpt) {
	defaultSize := _minimizerSize
	options.IntVar(
		&_minimizerSize,
		"minimizer-size",
		defaultSize,
		options.Alias("m"),
		options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."),
	)
}
// ==============================
// Lowmask-specific options
// ==============================
// Backing variables for the lowmask-specific options; the initial values
// are the defaults.
var (
	_entropySize      = 6
	_entropyThreshold = 0.5
	_splitMode        = false
	_extractMode      = false
	_maskingChar      = "."
	_keepShorter      = false
)
// LowMaskOptionSet registers the options used by low-complexity masking on
// the given option parser.
//
// It first registers the shared --kmer-size option through KmerSizeOptionSet,
// then the masking-specific flags. Each flag is bound to its package-level
// variable, whose current value is passed as the default.
func LowMaskOptionSet(options *getoptions.GetOpt) {
	KmerSizeOptionSet(options)

	options.IntVar(&_entropySize, "entropy-size", _entropySize,
		options.Description("Maximum word size considered for entropy estimate."))

	options.Float64Var(&_entropyThreshold, "threshold", _entropyThreshold,
		options.Description("Entropy threshold below which a kmer is masked (0 to 1)."))

	options.BoolVar(&_splitMode, "extract-high", _splitMode,
		options.Description("Extract only high-complexity regions."))

	options.BoolVar(&_extractMode, "extract-low", _extractMode,
		options.Description("Extract only low-complexity regions."))

	options.StringVar(&_maskingChar, "masking-char", _maskingChar,
		options.Description("Character used to mask low complexity regions."))

	// The help text names the flags as they are spelled on the command line
	// rather than the internal split/extract mode names.
	options.BoolVar(&_keepShorter, "keep-shorter", _keepShorter,
		options.Description("Keep fragments shorter than kmer-size when --extract-high or --extract-low is used."))
}
// ==============================
// Index-specific options
// ==============================
var _indexId = ""
var _metadataFormat = "toml"
var _setTag = make(map[string]string, 0)
@@ -44,13 +108,8 @@ var _saveFreqKmer = 0
// KmerIndexOptionSet defines every option related to kmer index building.
func KmerIndexOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
options.Alias("k"),
options.Description("Size of k-mers (must be between 2 and 31)."))
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
options.Alias("m"),
options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))
KmerSizeOptionSet(options)
MinimizerOptionSet(options)
options.StringVar(&_indexId, "index-id", _indexId,
options.Description("Identifier for the kmer index."))
@@ -76,6 +135,16 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
}
// ==============================
// Super kmer options
// ==============================
// SuperKmerOptionSet declares the options of super k-mer extraction on the
// given option parser by applying the shared k-mer size registrar, then the
// minimizer size registrar.
func SuperKmerOptionSet(options *getoptions.GetOpt) {
	registrars := []func(*getoptions.GetOpt){
		KmerSizeOptionSet,
		MinimizerOptionSet,
	}
	for _, register := range registrars {
		register(options)
	}
}
// CLIKmerSize returns the k-mer size.
func CLIKmerSize() int {
return _kmerSize
@@ -157,6 +226,42 @@ func SetMinOccurrence(n int) {
_minOccurrence = n
}
// CLIMaskingMode translates the masking flags into a MaskingMode.
// The extract flag is tested first, so it takes precedence when both flags
// are set; with neither flag set the result is MaskMode.
func CLIMaskingMode() MaskingMode {
	if _extractMode {
		return ExtractMode
	}
	if _splitMode {
		return SplitMode
	}
	return MaskMode
}
// CLIMaskingChar returns the masking character configured in _maskingChar
// as a single byte.
//
// Surrounding whitespace is trimmed first. A trimmed value that is not
// exactly one byte long is reported through log.Fatalf; the value is quoted
// in the message so an empty or whitespace-only setting stays visible.
func CLIMaskingChar() byte {
	mask := strings.TrimSpace(_maskingChar)
	if len(mask) != 1 {
		log.Fatalf("--masking-char option accepts a single character, not %q", mask)
	}
	// Indexing a string yields its byte directly; no []byte copy is needed.
	return mask[0]
}
// CLIEntropySize reports the current value of _entropySize, the maximum
// word size considered for the entropy estimate.
func CLIEntropySize() int { return _entropySize }
// CLIEntropyThreshold reports the current value of _entropyThreshold, the
// entropy level below which a kmer is masked.
func CLIEntropyThreshold() float64 { return _entropyThreshold }
// CLIKeepShorter reports the current value of _keepShorter, i.e. whether
// fragments shorter than the kmer size are kept.
func CLIKeepShorter() bool { return _keepShorter }
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
func OutputFormatOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&_jsonOutput, "json-output", false,

View File

@@ -13,26 +13,11 @@ import (
"github.com/DavidGamba/go-getoptions"
)
// Super k-mer specific option variables.
// These reuse _kmerSize and _minimizerSize from options.go since
// only one subcommand runs at a time.
// SuperKmerOptionSet declares the two options of super k-mer extraction on
// the given option parser: --kmer-size (alias k) and --minimizer-size
// (alias m). Both are bound to the shared _kmerSize and _minimizerSize
// variables, whose current values are used as defaults.
func SuperKmerOptionSet(options *getoptions.GetOpt) {
	options.IntVar(
		&_kmerSize, "kmer-size", _kmerSize,
		options.Alias("k"),
		options.Description("Size of k-mers (must be between m+1 and 31)."),
	)

	options.IntVar(
		&_minimizerSize, "minimizer-size", _minimizerSize,
		options.Alias("m"),
		options.Description("Size of minimizers (must be between 1 and k-1)."),
	)
}
// runSuper implements the "obik super" subcommand.
// It extracts super k-mers from DNA sequences.
func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
k := _kmerSize
m := _minimizerSize
k := CLIKmerSize()
m := CLIMinimizerSize()
if k < 2 || k > 31 {
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)