diff --git a/pkg/obitools/obik/lowmask.go b/pkg/obitools/obik/lowmask.go index 7710c1e..35c1505 100644 --- a/pkg/obitools/obik/lowmask.go +++ b/pkg/obitools/obik/lowmask.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "math" - "strings" log "github.com/sirupsen/logrus" @@ -16,67 +15,6 @@ import ( "github.com/DavidGamba/go-getoptions" ) -// MaskingMode defines how to handle low-complexity regions -type MaskingMode int - -const ( - MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters - SplitMode // Split sequence into high-complexity fragments - ExtractMode // Extract low-complexity fragments -) - -// Lowmask-specific option variables (separate from index/super kmer-size). -var _lowmaskKmerSize = 31 -var _lowmaskLevelMax = 6 -var _lowmaskThreshold = 0.5 -var _lowmaskSplitMode = false -var _lowmaskLowMode = false -var _lowmaskMaskChar = "." -var _lowmaskKeepShorter = false - -// LowMaskOptionSet registers options specific to low-complexity masking. -func LowMaskOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_lowmaskKmerSize, "kmer-size", _lowmaskKmerSize, - options.Description("Size of the kmer considered to estimate entropy.")) - - options.IntVar(&_lowmaskLevelMax, "entropy-size", _lowmaskLevelMax, - options.Description("Maximum word size considered for entropy estimate.")) - - options.Float64Var(&_lowmaskThreshold, "threshold", _lowmaskThreshold, - options.Description("Entropy threshold below which a kmer is masked (0 to 1).")) - - options.BoolVar(&_lowmaskSplitMode, "split-mode", _lowmaskSplitMode, - options.Description("Split sequences to remove masked regions.")) - - options.BoolVar(&_lowmaskLowMode, "low-mode", _lowmaskLowMode, - options.Description("Extract only low-complexity regions.")) - - options.StringVar(&_lowmaskMaskChar, "masking-char", _lowmaskMaskChar, - options.Description("Character used to mask low complexity regions.")) - - options.BoolVar(&_lowmaskKeepShorter, "keep-shorter", _lowmaskKeepShorter, - options.Description("Keep fragments shorter than kmer-size in split/extract mode.")) -} - -func lowmaskMaskingMode() MaskingMode { - switch { - case _lowmaskLowMode: - return ExtractMode - case _lowmaskSplitMode: - return SplitMode - default: - return MaskMode - } -} - -func lowmaskMaskingChar() byte { - mask := strings.TrimSpace(_lowmaskMaskChar) - if len(mask) != 1 { - log.Fatalf("--masking-char option accepts a single character, not %s", mask) - } - return []byte(mask)[0] -} - // lowMaskWorker creates a worker to mask low-complexity regions in DNA sequences. func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode MaskingMode, maskChar byte, keepShorter bool) obiseq.SeqWorker { @@ -453,11 +391,11 @@ func lowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking // runLowmask implements the "obik lowmask" subcommand. // It masks low-complexity regions in DNA sequences using entropy-based detection. func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) error { - kmerSize := _lowmaskKmerSize - levelMax := _lowmaskLevelMax - threshold := _lowmaskThreshold - mode := lowmaskMaskingMode() - maskChar := lowmaskMaskingChar() + kmerSize := CLIKmerSize() + levelMax := CLIEntropySize() + threshold := CLIEntropyThreshold() + mode := CLIMaskingMode() + maskChar := CLIMaskingChar() log.Printf("Low-complexity masking: kmer-size=%d, entropy-size=%d, threshold=%.4f", kmerSize, levelMax, threshold) @@ -466,7 +404,7 @@ func runLowmask(ctx context.Context, opt *getoptions.GetOpt, args []string) erro return fmt.Errorf("failed to open sequence files: %w", err) } - worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, _lowmaskKeepShorter) + worker := lowMaskWorker(kmerSize, levelMax, threshold, mode, maskChar, CLIKeepShorter()) masked := sequences.MakeIWorker( worker, diff --git a/pkg/obitools/obik/options.go b/pkg/obitools/obik/options.go index 7a15863..62b8434 100644 --- a/pkg/obitools/obik/options.go +++ b/pkg/obitools/obik/options.go @@ -11,6 +11,15 @@ import ( "github.com/DavidGamba/go-getoptions" ) +// MaskingMode defines how to handle low-complexity regions +type MaskingMode int + +const ( + MaskMode MaskingMode = iota // Replace low-complexity regions with masked characters + SplitMode // Split sequence into high-complexity fragments + ExtractMode // Extract low-complexity fragments +) + // Output format flags var _jsonOutput bool var _csvOutput bool @@ -29,11 +38,66 @@ var _jaccard bool var _setMetaTags = make(map[string]string, 0) // ============================== -// Kmer index building options (moved from obikindex) +// Shared kmer options (used by index, super, lowmask) // ============================== var _kmerSize = 31 var _minimizerSize = -1 // -1 means auto: ceil(k / 2.5) + +// KmerSizeOptionSet registers --kmer-size / -k. +// Shared by index, super, and lowmask subcommands. +func KmerSizeOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_kmerSize, "kmer-size", _kmerSize, + options.Alias("k"), + options.Description("Size of k-mers (must be between 2 and 31).")) +} + +// MinimizerOptionSet registers --minimizer-size / -m. +// Shared by index and super subcommands. +func MinimizerOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, + options.Alias("m"), + options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5)).")) +} + +// ============================== +// Lowmask-specific options +// ============================== + +var _entropySize = 6 +var _entropyThreshold = 0.5 +var _splitMode = false +var _extractMode = false +var _maskingChar = "." +var _keepShorter = false + +// LowMaskOptionSet registers options specific to low-complexity masking. +func LowMaskOptionSet(options *getoptions.GetOpt) { + KmerSizeOptionSet(options) + + options.IntVar(&_entropySize, "entropy-size", _entropySize, + options.Description("Maximum word size considered for entropy estimate.")) + + options.Float64Var(&_entropyThreshold, "threshold", _entropyThreshold, + options.Description("Entropy threshold below which a kmer is masked (0 to 1).")) + + options.BoolVar(&_splitMode, "extract-high", _splitMode, + options.Description("Extract only high-complexity regions.")) + + options.BoolVar(&_extractMode, "extract-low", _extractMode, + options.Description("Extract only low-complexity regions.")) + + options.StringVar(&_maskingChar, "masking-char", _maskingChar, + options.Description("Character used to mask low complexity regions.")) + + options.BoolVar(&_keepShorter, "keep-shorter", _keepShorter, + options.Description("Keep fragments shorter than kmer-size in split/extract mode.")) +} + +// ============================== +// Index-specific options +// ============================== + var _indexId = "" var _metadataFormat = "toml" var _setTag = make(map[string]string, 0) @@ -44,13 +108,8 @@ var _saveFreqKmer = 0 // KmerIndexOptionSet defines every option related to kmer index building. func KmerIndexOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_kmerSize, "kmer-size", _kmerSize, - options.Alias("k"), - options.Description("Size of k-mers (must be between 2 and 31).")) - - options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, - options.Alias("m"), - options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5)).")) + KmerSizeOptionSet(options) + MinimizerOptionSet(options) options.StringVar(&_indexId, "index-id", _indexId, options.Description("Identifier for the kmer index.")) @@ -76,6 +135,16 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) { options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv).")) } +// ============================== +// Super kmer options +// ============================== + +// SuperKmerOptionSet registers options specific to super k-mer extraction. +func SuperKmerOptionSet(options *getoptions.GetOpt) { + KmerSizeOptionSet(options) + MinimizerOptionSet(options) +} + // CLIKmerSize returns the k-mer size. func CLIKmerSize() int { return _kmerSize @@ -157,6 +226,42 @@ func SetMinOccurrence(n int) { _minOccurrence = n } +// CLIMaskingMode returns the masking mode from CLI flags. +func CLIMaskingMode() MaskingMode { + switch { + case _extractMode: + return ExtractMode + case _splitMode: + return SplitMode + default: + return MaskMode + } +} + +// CLIMaskingChar returns the masking character, validated. +func CLIMaskingChar() byte { + mask := strings.TrimSpace(_maskingChar) + if len(mask) != 1 { + log.Fatalf("--masking-char option accepts a single character, not %s", mask) + } + return []byte(mask)[0] +} + +// CLIEntropySize returns the entropy word size. +func CLIEntropySize() int { + return _entropySize +} + +// CLIEntropyThreshold returns the entropy threshold. +func CLIEntropyThreshold() float64 { + return _entropyThreshold +} + +// CLIKeepShorter returns whether to keep short fragments. +func CLIKeepShorter() bool { + return _keepShorter +} + // OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output. func OutputFormatOptionSet(options *getoptions.GetOpt) { options.BoolVar(&_jsonOutput, "json-output", false, diff --git a/pkg/obitools/obik/super.go b/pkg/obitools/obik/super.go index b6416a4..86d0d69 100644 --- a/pkg/obitools/obik/super.go +++ b/pkg/obitools/obik/super.go @@ -13,26 +13,11 @@ import ( "github.com/DavidGamba/go-getoptions" ) -// Super k-mer specific option variables. -// These reuse _kmerSize and _minimizerSize from options.go since -// only one subcommand runs at a time. - -// SuperKmerOptionSet registers options specific to super k-mer extraction. -func SuperKmerOptionSet(options *getoptions.GetOpt) { - options.IntVar(&_kmerSize, "kmer-size", _kmerSize, - options.Alias("k"), - options.Description("Size of k-mers (must be between m+1 and 31).")) - - options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize, - options.Alias("m"), - options.Description("Size of minimizers (must be between 1 and k-1).")) -} - // runSuper implements the "obik super" subcommand. // It extracts super k-mers from DNA sequences. func runSuper(ctx context.Context, opt *getoptions.GetOpt, args []string) error { - k := _kmerSize - m := _minimizerSize + k := CLIKmerSize() + m := CLIMinimizerSize() if k < 2 || k > 31 { return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)