Files
obitools4/pkg/obitools/obik/options.go
Eric Coissac ac41dd8a22 Refactor k-mer matching pipeline with improved concurrency and memory management
Refactor k-mer matching to use a pipeline architecture with improved concurrency and memory management:

- Replace sort.Slice with slices.SortFunc and cmp.Compare for better performance
- Introduce PreparedQueries struct to encapsulate query buckets with metadata
- Implement MergeQueries function to merge query buckets from multiple batches
- Rewrite MatchBatch to use pre-allocated results and mutexes instead of map-based accumulation
- Add seek optimization in matchPartition to reduce linear scanning
- Refactor match command to use a multi-stage pipeline with proper batching and merging
- Add index directory option for match command
- Improve parallel processing of sequence batches

This refactoring improves performance by reducing memory allocations, optimizing k-mer lookup, and implementing a more efficient pipeline for large-scale k-mer matching operations.
2026-02-10 22:10:36 +01:00

361 lines
11 KiB
Go

package obik
import (
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// MaskingMode selects what is done with the low-complexity regions of a
// sequence.
type MaskingMode int

const (
	// MaskMode replaces low-complexity regions with masked characters.
	MaskMode MaskingMode = iota
	// SplitMode splits the sequence into high-complexity fragments.
	SplitMode
	// ExtractMode extracts the low-complexity fragments.
	ExtractMode
)
// Package-level state backing flags shared by several subcommands.
var (
	// Output format flags (see OutputFormatOptionSet / CLIOutFormat).
	_jsonOutput, _csvOutput, _yamlOutput bool

	// Set selection flags (see SetSelectionOptionSet / CLISetPatterns).
	_setPatterns []string

	// Force flag (see ForceOptionSet / CLIForce).
	_force bool

	// Jaccard flag.
	_jaccard bool

	// Per-set tags for index subcommand.
	_setMetaTags = map[string]string{}
)
// ==============================
// Shared kmer options (used by index, super, lowmask)
// ==============================
var (
	// _kmerSize backs --kmer-size / -k.
	_kmerSize = 31

	// _minimizerSize backs --minimizer-size / -m.
	// -1 means auto: ceil(k / 2.5)
	_minimizerSize = -1
)
// KmerSizeOptionSet registers --kmer-size / -k on options.
// Shared by index, super, and lowmask subcommands.
//
// The option is bound to the package-level _kmerSize variable, and the
// variable's current value is passed as the option default. The help text
// advertises a range of 2 to 31; this function itself performs no range check.
func KmerSizeOptionSet(options *getoptions.GetOpt) {
	options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
		options.Alias("k"),
		options.Description("Size of k-mers (must be between 2 and 31)."))
}
// MinimizerOptionSet registers --minimizer-size / -m on options.
// Shared by index and super subcommands.
//
// The option is bound to the package-level _minimizerSize variable, and the
// variable's current value is passed as the option default. A negative value
// is resolved to an automatic size later, in CLIMinimizerSize, not here.
func MinimizerOptionSet(options *getoptions.GetOpt) {
	options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
		options.Alias("m"),
		options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))
}
// ==============================
// Lowmask-specific options
// ==============================
var (
	// Backing storage for --entropy-size and --threshold.
	_entropySize      = 6
	_entropyThreshold = 0.5

	// Backing storage for --extract-high (_splitMode), --extract-low
	// (_extractMode) and --keep-shorter; all start disabled.
	_splitMode, _extractMode, _keepShorter bool

	// Backing storage for --masking-char.
	_maskingChar = "."
)
// LowMaskOptionSet registers options specific to low-complexity masking.
//
// It first registers the shared --kmer-size option through KmerSizeOptionSet,
// then binds the following flags to package-level variables, passing each
// variable's current value as the option default:
//
//	--entropy-size  -> _entropySize
//	--threshold     -> _entropyThreshold
//	--extract-high  -> _splitMode
//	--extract-low   -> _extractMode
//	--masking-char  -> _maskingChar
//	--keep-shorter  -> _keepShorter
func LowMaskOptionSet(options *getoptions.GetOpt) {
	KmerSizeOptionSet(options)
	options.IntVar(&_entropySize, "entropy-size", _entropySize,
		options.Description("Maximum word size considered for entropy estimate."))
	options.Float64Var(&_entropyThreshold, "threshold", _entropyThreshold,
		options.Description("Entropy threshold below which a kmer is masked (0 to 1)."))
	// --extract-high drives _splitMode, which CLIMaskingMode reports as SplitMode.
	options.BoolVar(&_splitMode, "extract-high", _splitMode,
		options.Description("Extract only high-complexity regions."))
	// --extract-low drives _extractMode, which CLIMaskingMode reports as ExtractMode.
	options.BoolVar(&_extractMode, "extract-low", _extractMode,
		options.Description("Extract only low-complexity regions."))
	// The value is validated (exactly one byte after trimming) in CLIMaskingChar.
	options.StringVar(&_maskingChar, "masking-char", _maskingChar,
		options.Description("Character used to mask low complexity regions."))
	options.BoolVar(&_keepShorter, "keep-shorter", _keepShorter,
		options.Description("Keep fragments shorter than kmer-size in split/extract mode."))
}
// ==============================
// Index-specific options
// ==============================
var (
	// Backing storage for --index-id and --metadata-format.
	_indexId        string
	_metadataFormat = "toml"

	// Backing storage for --set-tag / -S (KEY=VALUE pairs).
	_setTag = map[string]string{}

	// Backing storage for --min-occurrence and --max-occurrence.
	_minOccurrence = 1
	_maxOccurrence int

	// Backing storage for --save-full-filter and --save-freq-kmer.
	_saveFullFilter bool
	_saveFreqKmer   int

	// Backing storage for --entropy-filter and --entropy-filter-size.
	_indexEntropyThreshold float64
	_indexEntropySize      = 6
)
// KmerIndexOptionSet defines every option related to kmer index building.
//
// Registration order is: the shared --kmer-size and --minimizer-size options,
// the index-specific options (--index-id, --metadata-format, --set-tag / -S,
// --min-occurrence, --max-occurrence, --save-full-filter, --save-freq-kmer),
// and finally the entropy filter options. The entropy filter options are
// registered through EntropyFilterOptionSet rather than duplicated here, so
// both entry points always expose identical flag names, defaults and help text.
//
// Each option is bound to a package-level variable whose current value is
// passed as the option default.
func KmerIndexOptionSet(options *getoptions.GetOpt) {
	KmerSizeOptionSet(options)
	MinimizerOptionSet(options)

	options.StringVar(&_indexId, "index-id", _indexId,
		options.Description("Identifier for the kmer index."))

	// The accepted spellings are interpreted later by CLIMetadataFormat.
	options.StringVar(&_metadataFormat, "metadata-format", _metadataFormat,
		options.Description("Format for metadata file (toml, yaml, json)."))

	// The two integer arguments (1, 1) are handed to StringMapVar unchanged;
	// presumably the min/max number of KEY=VALUE arguments per occurrence —
	// confirm against the go-getoptions documentation.
	options.StringMapVar(&_setTag, "set-tag", 1, 1,
		options.Alias("S"),
		options.ArgName("KEY=VALUE"),
		options.Description("Adds a group-level metadata attribute KEY with value VALUE."))

	options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
		options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))

	options.IntVar(&_maxOccurrence, "max-occurrence", _maxOccurrence,
		options.Description("Maximum number of occurrences for a k-mer to be kept (default 0 = no upper bound)."))

	options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
		options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))

	options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
		options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))

	// Registered last to keep the same option order as before.
	EntropyFilterOptionSet(options)
}

// EntropyFilterOptionSet registers entropy filter options for commands
// that process existing indices (e.g. filter). KmerIndexOptionSet calls it
// as well, so it must not be combined with KmerIndexOptionSet on the same
// option set.
//
// It binds --entropy-filter to _indexEntropyThreshold and
// --entropy-filter-size to _indexEntropySize, passing each variable's current
// value as the option default.
func EntropyFilterOptionSet(options *getoptions.GetOpt) {
	options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
		options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))

	options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
		options.Description("Maximum word size for entropy filter computation (default 6)."))
}
// ==============================
// Super kmer options
// ==============================

// SuperKmerOptionSet registers options specific to super k-mer extraction.
//
// It adds no option of its own: it registers the shared --kmer-size option
// through KmerSizeOptionSet, then the shared --minimizer-size option through
// MinimizerOptionSet.
func SuperKmerOptionSet(options *getoptions.GetOpt) {
	KmerSizeOptionSet(options)
	MinimizerOptionSet(options)
}
// CLIKmerSize returns the k-mer size, i.e. the current value of _kmerSize
// (set by --kmer-size / -k or by SetKmerSize). The value is returned as
// stored; no range validation happens here.
func CLIKmerSize() int {
	return _kmerSize
}
// CLIMinimizerSize returns the effective minimizer size.
//
// A negative configured size (the default is -1) is replaced by
// obikmer.DefaultMinimizerSize applied to the current k-mer size. The
// candidate is then handed to obikmer.ValidateMinimizerSize together with the
// k-mer size and the number of parallel workers, and that function's result is
// returned.
func CLIMinimizerSize() int {
	size := _minimizerSize
	if size < 0 {
		// Auto mode: derive the minimizer size from the k-mer size.
		size = obikmer.DefaultMinimizerSize(_kmerSize)
	}

	return obikmer.ValidateMinimizerSize(size, _kmerSize, obidefault.ParallelWorkers())
}
// CLIIndexId returns the index identifier, i.e. the current value of _indexId
// (set by --index-id; empty by default).
func CLIIndexId() string {
	return _indexId
}
// CLIMetadataFormat translates the --metadata-format value into an
// obikmer.MetadataFormat.
//
// The comparison is case-insensitive. "yaml" and "json" select
// obikmer.FormatYAML and obikmer.FormatJSON; "toml" selects
// obikmer.FormatTOML. Any other value also yields obikmer.FormatTOML, after a
// warning naming the unrecognized value has been logged.
func CLIMetadataFormat() obikmer.MetadataFormat {
	name := strings.ToLower(_metadataFormat)

	if name == "yaml" {
		return obikmer.FormatYAML
	}
	if name == "json" {
		return obikmer.FormatJSON
	}
	if name != "toml" {
		// Unknown spelling: report the raw user input, then fall back to TOML.
		log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat)
	}

	return obikmer.FormatTOML
}
// CLISetTag returns the group-level metadata key=value pairs collected from
// --set-tag / -S. The package-level map itself is returned, not a copy, so
// modifications made by the caller are visible to later calls.
func CLISetTag() map[string]string {
	return _setTag
}
// CLIMinOccurrence returns the minimum occurrence threshold, i.e. the current
// value of _minOccurrence (set by --min-occurrence or SetMinOccurrence;
// 1 by default).
func CLIMinOccurrence() int {
	return _minOccurrence
}
// CLIMaxOccurrence returns the maximum occurrence threshold (0 = no upper bound),
// i.e. the current value of _maxOccurrence (set by --max-occurrence;
// 0 by default).
func CLIMaxOccurrence() int {
	return _maxOccurrence
}
// CLISaveFullFilter returns whether to save the full frequency filter, i.e.
// the current value of _saveFullFilter (set by --save-full-filter; false by
// default).
func CLISaveFullFilter() bool {
	return _saveFullFilter
}
// CLISaveFreqKmer returns the number of top frequent k-mers to save (0 = disabled),
// i.e. the current value of _saveFreqKmer (set by --save-freq-kmer;
// 0 by default).
func CLISaveFreqKmer() int {
	return _saveFreqKmer
}
// CLIOutputDirectory returns the output directory path.
//
// The value is whatever obiconvert.CLIOutPutFileName() returns: this package
// reuses obiconvert's output file name setting as a directory path.
// NOTE(review): presumably this is the value of obiconvert's output option —
// confirm against the obiconvert package.
func CLIOutputDirectory() string {
	return obiconvert.CLIOutPutFileName()
}
// SetKmerSize sets the k-mer size (for testing).
// It overwrites the package-level _kmerSize with k, without any validation.
func SetKmerSize(k int) {
	_kmerSize = k
}
// SetMinimizerSize sets the minimizer size (for testing).
// It overwrites the package-level _minimizerSize with m, without any
// validation; a negative m makes CLIMinimizerSize fall back to the automatic
// size.
func SetMinimizerSize(m int) {
	_minimizerSize = m
}
// SetMinOccurrence sets the minimum occurrence (for testing).
// It overwrites the package-level _minOccurrence with n, without any
// validation.
func SetMinOccurrence(n int) {
	_minOccurrence = n
}
// CLIMaskingMode derives the masking mode from the --extract-low and
// --extract-high flags.
//
// --extract-low takes precedence: when it is set the result is ExtractMode,
// even if --extract-high is set too. Otherwise --extract-high yields SplitMode,
// and with neither flag the result is MaskMode.
func CLIMaskingMode() MaskingMode {
	if _extractMode {
		return ExtractMode
	}
	if _splitMode {
		return SplitMode
	}
	return MaskMode
}
// CLIMaskingChar returns the masking character as a single byte.
//
// Surrounding whitespace is stripped from the --masking-char value first. If
// the remainder is not exactly one byte long, the problem is reported through
// log.Fatalf. The first byte of the trimmed value is returned.
func CLIMaskingChar() byte {
	trimmed := strings.TrimSpace(_maskingChar)

	if len(trimmed) != 1 {
		log.Fatalf("--masking-char option accepts a single character, not %s", trimmed)
	}

	// Indexing a string yields its byte, same as converting to []byte first.
	return trimmed[0]
}
// CLIEntropySize returns the entropy word size, i.e. the current value of
// _entropySize (set by --entropy-size; 6 by default).
func CLIEntropySize() int {
	return _entropySize
}
// CLIEntropyThreshold returns the entropy threshold, i.e. the current value of
// _entropyThreshold (set by --threshold; 0.5 by default).
func CLIEntropyThreshold() float64 {
	return _entropyThreshold
}
// CLIKeepShorter returns whether to keep short fragments, i.e. the current
// value of _keepShorter (set by --keep-shorter; false by default).
func CLIKeepShorter() bool {
	return _keepShorter
}
// ==============================
// Match-specific options
// ==============================

// _indexDirectory backs the --index / -i option; it starts out empty.
var _indexDirectory string
// IndexDirectoryOptionSet registers --index / -i (mandatory directory for match).
//
// The option is bound to the package-level _indexDirectory variable, is
// flagged with options.Required(), and uses DIRECTORY as its argument name in
// the help output.
func IndexDirectoryOptionSet(options *getoptions.GetOpt) {
	options.StringVar(&_indexDirectory, "index", _indexDirectory,
		options.Alias("i"),
		options.Required(),
		options.ArgName("DIRECTORY"),
		options.Description("Path to the kmer index directory."))
}
// CLIIndexDirectory returns the --index directory path, i.e. the current value
// of _indexDirectory. The path is returned as given; it is not checked here.
func CLIIndexDirectory() string {
	return _indexDirectory
}
// CLIIndexEntropyThreshold returns the entropy filter threshold for index building (0 = disabled),
// i.e. the current value of _indexEntropyThreshold (set by --entropy-filter;
// 0 by default).
func CLIIndexEntropyThreshold() float64 {
	return _indexEntropyThreshold
}
// CLIIndexEntropySize returns the entropy filter word size for index building,
// i.e. the current value of _indexEntropySize (set by --entropy-filter-size;
// 6 by default).
func CLIIndexEntropySize() int {
	return _indexEntropySize
}
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
//
// Each flag is bound to its own package-level boolean (_jsonOutput,
// _csvOutput, _yamlOutput) with a literal default of false. Nothing here
// makes the flags mutually exclusive; CLIOutFormat decides which one wins
// when several are set.
func OutputFormatOptionSet(options *getoptions.GetOpt) {
	options.BoolVar(&_jsonOutput, "json-output", false,
		options.Description("Print results as JSON."))
	options.BoolVar(&_csvOutput, "csv-output", false,
		options.Description("Print results as CSV."))
	options.BoolVar(&_yamlOutput, "yaml-output", false,
		options.Description("Print results as YAML."))
}
// CLIOutFormat reports the output format chosen on the command line as one of
// "json", "csv", "yaml" or "text".
//
// The flags are checked in that order, so when several are set the first one
// in the list json, csv, yaml wins. "text" is the result when none is set.
func CLIOutFormat() string {
	switch {
	case _jsonOutput:
		return "json"
	case _csvOutput:
		return "csv"
	case _yamlOutput:
		return "yaml"
	default:
		return "text"
	}
}
// SetSelectionOptionSet registers --set <glob_pattern> (repeatable).
//
// The option has the alias -s, uses PATTERN as its argument name in the help
// output, and appends its values to the package-level _setPatterns slice.
func SetSelectionOptionSet(options *getoptions.GetOpt) {
	// The two integer arguments (1, 1) are handed to StringSliceVar unchanged;
	// presumably the min/max number of values per occurrence — confirm against
	// the go-getoptions documentation.
	options.StringSliceVar(&_setPatterns, "set", 1, 1,
		options.Alias("s"),
		options.ArgName("PATTERN"),
		options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...])."))
}
// CLISetPatterns returns the --set patterns provided by the user.
// The package-level slice itself is returned, not a copy; it is nil until a
// value has been stored in it.
func CLISetPatterns() []string {
	return _setPatterns
}
// ForceOptionSet registers --force / -f.
//
// The flag is bound to the package-level _force boolean with a literal default
// of false; its value is read back through CLIForce.
func ForceOptionSet(options *getoptions.GetOpt) {
	options.BoolVar(&_force, "force", false,
		options.Alias("f"),
		options.Description("Force operation even if set ID already exists in destination."))
}
// CLIForce returns whether --force was specified, i.e. the current value of
// the package-level _force boolean.
func CLIForce() bool {
	return _force
}