From 9babcc0fae6816d5818baa194c9a3e586f8e9230 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 10 Feb 2026 09:52:28 +0100 Subject: [PATCH] Refactor lowmask options and shared kmer options Refactor lowmask options to use shared kmer options and CLI getters This commit refactors the lowmask subcommand to use shared kmer options and CLI getters instead of local variables. It also moves the kmer size and minimizer size options to a shared location and adds new CLI getters for the lowmask options. - Move kmer size and minimizer size options to shared location - Add CLI getters for lowmask options - Refactor lowmask to use CLI getters - Remove unused strings import - Move MaskingMode type and its helper functions from lowmask.go to options.go (helpers renamed to CLIMaskingMode and CLIMaskingChar) - Rename lowmask flags --split-mode and --low-mode to --extract-high and --extract-low - lowmask --kmer-size now uses the shared kmer-size option (adds the -k alias) --- pkg/obitools/obik/lowmask.go | 74 ++------------------- pkg/obitools/obik/options.go | 121 ++++++++++++++++++++++++++++++++--- pkg/obitools/obik/super.go | 19 +----- 3 files changed, 121 insertions(+), 93 deletions(-) diff --git a/pkg/obitools/obik/lowmask.go b/pkg/obitools/obik/lowmask.go index 7710c1e..35c1505 100644 --- a/pkg/obitools/obik/lowmask.go +++ b/pkg/obitools/obik/lowmask.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "math" - "strings" log "github.com/sirupsen/logrus" @@ -16,67 +15,6 @@ import ( "github.com/DavidGamba/go-getoptions" ) -// MaskingMode defines how to handle low-complexity regions -type MaskingMode int - -const ( - MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters - SplitMode // Split sequence into high-complexity fragments - ExtractMode // Extract low-complexity fragments -) - -// Lowmask-specific option variables (separate from index/super kmer-size). -var _lowmaskKmerSize = 31 -var _lowmaskLevelMax = 6 -var _lowmaskThreshold = 0.5 -var _lowmaskSplitMode = false -var _lowmaskLowMode = false -var _lowmaskMaskChar = "." -var _lowmaskKeepShorter = false - -// LowMaskOptionSet registers options specific to low-complexity masking. 
-func LowMaskOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize, - options.Description("Size of the kmer considered to estimate entropy.")) - - options.IntVar(&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax, - options.Description("Maximum word size considered for entropy estimate.")) - - options.Float64Var(&_lowmaskThreshold, "threshold", _lowmaskThreshold, - options.Description("Entropy threshold below which a kmer is masked (0 to 1).")) - - options.BoolVar(&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode, - options.Description("Split sequences to remove masked regions.")) - - options.BoolVar(&_lowmaskLowMode, "low-mode", _lowmaskLowMode, - options.Description("Extract only low-complexity regions.")) - - options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar, - options.Description("Character used to mask low complexity regions.")) - - options.BoolVar(&_lowmaskKeepShorter, "keep-shorter", _lowmaskKeepShorter, - options.Description("Keep fragments shorter than kmer-size in split/extract mode.")) -} - -func lowmaskMaskingMode() MaskingMode { - switch { - case _lowmaskLowMode: - return ExtractMode - case _lowmaskSplitMode: - return SplitMode - default: - return MaskMode - } -} - -func lowmaskMaskingChar() byte { - mask := strings.TrimSpace(_lowmaskMaskChar) - if len(mask) != 1 { - log.Fatalf("--masking-char option accepts a single character, not %s", mask) - } - return []byte(mask)[0] -} - // lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker { @@ -453,11 +391,11 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking // runLowmask implements the "obik lowmask" subcommand. // It masks low-complexity regions in DNA sequences using entropy-based detection. 
func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error { - kmerSize := _lowmaskKmerSize - levelMax := _lowmaskLevelMax - threshold := _lowmaskThreshold - mode := lowmaskMaskingMode() - maskChar := lowmaskMaskingChar() + kmerSize := CLIKmerSize() + levelMax := CLIEntropySize() + threshold := CLIEntropyThreshold() + mode := CLIMaskingMode() + maskChar := CLIMaskingChar() log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold) @@ -466,7 +404,7 @@ func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) erro return fmt.Errorf("failed to open sequence files: %w", err) } - worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, _lowmaskKeepShorter) + worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, CLIKeepShorter()) masked := sequences.MakeIWorker( worker, diff --git a/pkg/obitools/obik/options.go b/pkg/obitools/obik/options.go index 7a15863..62b8434 100644 --- a/pkg/obitools/obik/options.go +++ b/pkg/obitools/obik/options.go @@ -11,6 +11,15 @@ import ( "github.com/DavidGamba/go-getoptions" ) +// MaskingMode defines how to handle low-complexity regions +type MaskingMode int + +const ( + MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters + SplitMode // Split sequence into high-complexity fragments + ExtractMode // Extract low-complexity fragments +) + // Output format flags var _jsonOutput bool var _csvOutput bool @@ -29,11 +38,66 @@ var _jaccard bool var _setMetaTags = make(map[string]string, 0) // ============================== -// Kmer index building options (moved from obikindex) +// Shared kmer options (used by index, super, lowmask) // ============================== var _kmerSize = 31 var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5) + +// KmerSizeOptionSet registers --kmer-size / -k. +// Shared by index, super, and lowmask subcommands. 
+func KmerSizeOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_kmerSize, "kmer-size", _kmerSize, + options.Alias("k"), + options.Description("Size of k-mers (must be between 2 and 31).")) +} + +// MinimizerOptionSet registers --minimizer-size / -m. +// Shared by index and super subcommands. +func MinimizerOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, + options.Alias("m"), + options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5)).")) +} + +// ============================== +// Lowmask-specific options +// ============================== + +var _entropySize = 6 +var _entropyThreshold = 0.5 +var _splitMode = false +var _extractMode = false +var _maskingChar = "." +var _keepShorter = false + +// LowMaskOptionSet registers options specific to low-complexity masking. +func LowMaskOptionSet(options *getoptions.GetOpt) { + KmerSizeOptionSet(options) + + options.IntVar(&_entropySize, "entropy-size", _entropySize, + options.Description("Maximum word size considered for entropy estimate.")) + + options.Float64Var(&_entropyThreshold, "threshold", _entropyThreshold, + options.Description("Entropy threshold below which a kmer is masked (0 to 1).")) + + options.BoolVar(&_splitMode, "extract-high", _splitMode, + options.Description("Extract only high-complexity regions.")) + + options.BoolVar(&_extractMode, "extract-low", _extractMode, + options.Description("Extract only low-complexity regions.")) + + options.StringVar(&_maskingChar, "masking-char", _maskingChar, + options.Description("Character used to mask low complexity regions.")) + + options.BoolVar(&_keepShorter, "keep-shorter", _keepShorter, + options.Description("Keep fragments shorter than kmer-size in split/extract mode.")) +} + +// ============================== +// Index-specific options +// ============================== + var _indexId = "" var _metadataFormat = "toml" var _setTag = make(map[string]string, 0) @@ 
-44,13 +108,8 @@ var _saveFreqKmer = 0 // KmerIndexOptionSet defines every option related to kmer index building. func KmerIndexOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_kmerSize, "kmer-size", _kmerSize, - options.Alias("k"), - options.Description("Size of k-mers (must be between 2 and 31).")) - - options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, - options.Alias("m"), - options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5)).")) + KmerSizeOptionSet(options) + MinimizerOptionSet(options) options.StringVar(&_indexId, "index-id", _indexId, options.Description("Identifier for the kmer index.")) @@ -76,6 +135,16 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) { options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv).")) } +// ============================== +// Super kmer options +// ============================== + +// SuperKmerOptionSet registers options specific to super k-mer extraction. +func SuperKmerOptionSet(options *getoptions.GetOpt) { + KmerSizeOptionSet(options) + MinimizerOptionSet(options) +} + // CLIKmerSize returns the k-mer size. func CLIKmerSize() int { return _kmerSize @@ -157,6 +226,42 @@ func SetMinOccurrence(n int) { _minOccurrence = n } +// CLIMaskingMode returns the masking mode from CLI flags. +func CLIMaskingMode() MaskingMode { + switch { + case _extractMode: + return ExtractMode + case _splitMode: + return SplitMode + default: + return MaskMode + } +} + +// CLIMaskingChar returns the masking character, validated. +func CLIMaskingChar() byte { + mask := strings.TrimSpace(_maskingChar) + if len(mask) != 1 { + log.Fatalf("--masking-char option accepts a single character, not %s", mask) + } + return []byte(mask)[0] +} + +// CLIEntropySize returns the entropy word size. +func CLIEntropySize() int { + return _entropySize +} + +// CLIEntropyThreshold returns the entropy threshold. 
+func CLIEntropyThreshold() float64 { + return _entropyThreshold +} + +// CLIKeepShorter returns whether to keep short fragments. +func CLIKeepShorter() bool { + return _keepShorter +} + // OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output. func OutputFormatOptionSet(options *getoptions.GetOpt) { options.BoolVar(&_jsonOutput, "json-output", false, diff --git a/pkg/obitools/obik/super.go b/pkg/obitools/obik/super.go index b6416a4..86d0d69 100644 --- a/pkg/obitools/obik/super.go +++ b/pkg/obitools/obik/super.go @@ -13,26 +13,11 @@ import ( "github.com/DavidGamba/go-getoptions" ) -// Super k-mer specific option variables. -// These reuse _kmerSize and _minimizerSize from options.go since -// only one subcommand runs at a time. - -// SuperKmerOptionSet registers options specific to super k-mer extraction. -func SuperKmerOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_kmerSize, "kmer-size", _kmerSize, - options.Alias("k"), - options.Description("Size of k-mers (must be between m+1 and 31).")) - - options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, - options.Alias("m"), - options.Description("Size of minimizers (must be between 1 and k-1).")) -} - // runSuper implements the "obik super" subcommand. // It extracts super k-mers from DNA sequences. func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error { - k := _kmerSize - m := _minimizerSize + k := CLIKmerSize() + m := CLIMinimizerSize() if k < 2 || k > 31 { return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)