mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
Refactor k-mer matching to use a pipeline architecture with improved concurrency and memory management:
- Replace sort.Slice with slices.SortFunc and cmp.Compare for better performance
- Introduce PreparedQueries struct to encapsulate query buckets with metadata
- Implement MergeQueries function to merge query buckets from multiple batches
- Rewrite MatchBatch to use pre-allocated results and mutexes instead of map-based accumulation
- Add seek optimization in matchPartition to reduce linear scanning
- Refactor match command to use a multi-stage pipeline with proper batching and merging
- Add index directory option for match command
- Improve parallel processing of sequence batches
This refactoring improves performance by reducing memory allocations, optimizing k-mer lookup, and implementing a more efficient pipeline for large-scale k-mer matching operations.
361 lines
11 KiB
Go
361 lines
11 KiB
Go
package obik
|
|
|
|
import (
|
|
"strings"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
|
"github.com/DavidGamba/go-getoptions"
|
|
)
|
|
|
|
// MaskingMode selects how low-complexity regions of a sequence are handled.
type MaskingMode int

// Available masking modes, numbered from zero in declaration order.
const (
	// MaskMode replaces low-complexity regions with masked characters.
	MaskMode MaskingMode = iota
	// SplitMode splits the sequence into high-complexity fragments.
	SplitMode
	// ExtractMode extracts the low-complexity fragments.
	ExtractMode
)
|
|
|
|
// Flag storage filled in by the option-set functions of this package.
var (
	// Output format flags (--json-output, --csv-output, --yaml-output).
	_jsonOutput bool
	_csvOutput  bool
	_yamlOutput bool

	// Set selection patterns (--set).
	_setPatterns []string

	// Force flag (--force).
	_force bool

	// Jaccard flag.
	_jaccard bool

	// Per-set tags for the index subcommand.
	_setMetaTags = make(map[string]string)
)
|
|
|
|
// ==============================
// Shared kmer options (used by index, super, lowmask)
// ==============================

var (
	// _kmerSize backs --kmer-size / -k.
	_kmerSize = 31
	// _minimizerSize backs --minimizer-size / -m; -1 means auto: ceil(k / 2.5).
	_minimizerSize = -1
)
|
|
|
|
// KmerSizeOptionSet registers --kmer-size / -k.
|
|
// Shared by index, super, and lowmask subcommands.
|
|
func KmerSizeOptionSet(options *getoptions.GetOpt) {
|
|
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
|
options.Alias("k"),
|
|
options.Description("Size of k-mers (must be between 2 and 31)."))
|
|
}
|
|
|
|
// MinimizerOptionSet registers --minimizer-size / -m.
|
|
// Shared by index and super subcommands.
|
|
func MinimizerOptionSet(options *getoptions.GetOpt) {
|
|
options.IntVar(&_minimizerSize, "minimizer-size", _minimizerSize,
|
|
options.Alias("m"),
|
|
options.Description("Size of minimizers for parallelization (-1 for auto = ceil(k/2.5))."))
|
|
}
|
|
|
|
// ==============================
// Lowmask-specific options
// ==============================

var (
	_entropySize      = 6     // --entropy-size
	_entropyThreshold = 0.5   // --threshold
	_splitMode        = false // --extract-high
	_extractMode      = false // --extract-low
	_maskingChar      = "."   // --masking-char
	_keepShorter      = false // --keep-shorter
)
|
|
|
|
// LowMaskOptionSet registers options specific to low-complexity masking.
|
|
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
|
KmerSizeOptionSet(options)
|
|
|
|
options.IntVar(&_entropySize, "entropy-size", _entropySize,
|
|
options.Description("Maximum word size considered for entropy estimate."))
|
|
|
|
options.Float64Var(&_entropyThreshold, "threshold", _entropyThreshold,
|
|
options.Description("Entropy threshold below which a kmer is masked (0 to 1)."))
|
|
|
|
options.BoolVar(&_splitMode, "extract-high", _splitMode,
|
|
options.Description("Extract only high-complexity regions."))
|
|
|
|
options.BoolVar(&_extractMode, "extract-low", _extractMode,
|
|
options.Description("Extract only low-complexity regions."))
|
|
|
|
options.StringVar(&_maskingChar, "masking-char", _maskingChar,
|
|
options.Description("Character used to mask low complexity regions."))
|
|
|
|
options.BoolVar(&_keepShorter, "keep-shorter", _keepShorter,
|
|
options.Description("Keep fragments shorter than kmer-size in split/extract mode."))
|
|
}
|
|
|
|
// ==============================
// Index-specific options
// ==============================

var (
	_indexId               = ""                      // --index-id
	_metadataFormat        = "toml"                  // --metadata-format
	_setTag                = make(map[string]string) // --set-tag KEY=VALUE pairs
	_minOccurrence         = 1                       // --min-occurrence
	_maxOccurrence         = 0                       // --max-occurrence (0 = no upper bound)
	_saveFullFilter        = false                   // --save-full-filter
	_saveFreqKmer          = 0                       // --save-freq-kmer
	_indexEntropyThreshold = 0.0                     // --entropy-filter (0 = disabled)
	_indexEntropySize      = 6                       // --entropy-filter-size
)
|
|
|
|
// KmerIndexOptionSet defines every option related to kmer index building.
|
|
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
|
KmerSizeOptionSet(options)
|
|
MinimizerOptionSet(options)
|
|
|
|
options.StringVar(&_indexId, "index-id", _indexId,
|
|
options.Description("Identifier for the kmer index."))
|
|
|
|
options.StringVar(&_metadataFormat, "metadata-format", _metadataFormat,
|
|
options.Description("Format for metadata file (toml, yaml, json)."))
|
|
|
|
options.StringMapVar(&_setTag, "set-tag", 1, 1,
|
|
options.Alias("S"),
|
|
options.ArgName("KEY=VALUE"),
|
|
options.Description("Adds a group-level metadata attribute KEY with value VALUE."))
|
|
|
|
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
|
|
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
|
|
|
|
options.IntVar(&_maxOccurrence, "max-occurrence", _maxOccurrence,
|
|
options.Description("Maximum number of occurrences for a k-mer to be kept (default 0 = no upper bound)."))
|
|
|
|
options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
|
|
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
|
|
|
|
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
|
|
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
|
|
|
|
options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
|
|
options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))
|
|
|
|
options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
|
|
options.Description("Maximum word size for entropy filter computation (default 6)."))
|
|
}
|
|
|
|
// EntropyFilterOptionSet registers entropy filter options for commands
|
|
// that process existing indices (e.g. filter).
|
|
func EntropyFilterOptionSet(options *getoptions.GetOpt) {
|
|
options.Float64Var(&_indexEntropyThreshold, "entropy-filter", _indexEntropyThreshold,
|
|
options.Description("Filter low-complexity k-mers with entropy <= threshold (0 = disabled)."))
|
|
|
|
options.IntVar(&_indexEntropySize, "entropy-filter-size", _indexEntropySize,
|
|
options.Description("Maximum word size for entropy filter computation (default 6)."))
|
|
}
|
|
|
|
// ==============================
|
|
// Super kmer options
|
|
// ==============================
|
|
|
|
// SuperKmerOptionSet registers options specific to super k-mer extraction.
|
|
func SuperKmerOptionSet(options *getoptions.GetOpt) {
|
|
KmerSizeOptionSet(options)
|
|
MinimizerOptionSet(options)
|
|
}
|
|
|
|
// CLIKmerSize returns the k-mer size.
|
|
func CLIKmerSize() int {
|
|
return _kmerSize
|
|
}
|
|
|
|
// CLIMinimizerSize returns the effective minimizer size.
|
|
func CLIMinimizerSize() int {
|
|
m := _minimizerSize
|
|
if m < 0 {
|
|
m = obikmer.DefaultMinimizerSize(_kmerSize)
|
|
}
|
|
nworkers := obidefault.ParallelWorkers()
|
|
m = obikmer.ValidateMinimizerSize(m, _kmerSize, nworkers)
|
|
return m
|
|
}
|
|
|
|
// CLIIndexId returns the index identifier.
|
|
func CLIIndexId() string {
|
|
return _indexId
|
|
}
|
|
|
|
// CLIMetadataFormat returns the metadata format.
|
|
func CLIMetadataFormat() obikmer.MetadataFormat {
|
|
switch strings.ToLower(_metadataFormat) {
|
|
case "toml":
|
|
return obikmer.FormatTOML
|
|
case "yaml":
|
|
return obikmer.FormatYAML
|
|
case "json":
|
|
return obikmer.FormatJSON
|
|
default:
|
|
log.Warnf("Unknown metadata format %q, defaulting to TOML", _metadataFormat)
|
|
return obikmer.FormatTOML
|
|
}
|
|
}
|
|
|
|
// CLISetTag returns the group-level metadata key=value pairs.
|
|
func CLISetTag() map[string]string {
|
|
return _setTag
|
|
}
|
|
|
|
// CLIMinOccurrence returns the minimum occurrence threshold.
|
|
func CLIMinOccurrence() int {
|
|
return _minOccurrence
|
|
}
|
|
|
|
// CLIMaxOccurrence returns the maximum occurrence threshold (0 = no upper bound).
|
|
func CLIMaxOccurrence() int {
|
|
return _maxOccurrence
|
|
}
|
|
|
|
// CLISaveFullFilter returns whether to save the full frequency filter.
|
|
func CLISaveFullFilter() bool {
|
|
return _saveFullFilter
|
|
}
|
|
|
|
// CLISaveFreqKmer returns the number of top frequent k-mers to save (0 = disabled).
|
|
func CLISaveFreqKmer() int {
|
|
return _saveFreqKmer
|
|
}
|
|
|
|
// CLIOutputDirectory returns the output directory path.
|
|
func CLIOutputDirectory() string {
|
|
return obiconvert.CLIOutPutFileName()
|
|
}
|
|
|
|
// SetKmerSize sets the k-mer size (for testing).
|
|
func SetKmerSize(k int) {
|
|
_kmerSize = k
|
|
}
|
|
|
|
// SetMinimizerSize sets the minimizer size (for testing).
|
|
func SetMinimizerSize(m int) {
|
|
_minimizerSize = m
|
|
}
|
|
|
|
// SetMinOccurrence sets the minimum occurrence (for testing).
|
|
func SetMinOccurrence(n int) {
|
|
_minOccurrence = n
|
|
}
|
|
|
|
// CLIMaskingMode returns the masking mode from CLI flags.
|
|
func CLIMaskingMode() MaskingMode {
|
|
switch {
|
|
case _extractMode:
|
|
return ExtractMode
|
|
case _splitMode:
|
|
return SplitMode
|
|
default:
|
|
return MaskMode
|
|
}
|
|
}
|
|
|
|
// CLIMaskingChar returns the masking character, validated.
|
|
func CLIMaskingChar() byte {
|
|
mask := strings.TrimSpace(_maskingChar)
|
|
if len(mask) != 1 {
|
|
log.Fatalf("--masking-char option accepts a single character, not %s", mask)
|
|
}
|
|
return []byte(mask)[0]
|
|
}
|
|
|
|
// CLIEntropySize returns the entropy word size.
|
|
func CLIEntropySize() int {
|
|
return _entropySize
|
|
}
|
|
|
|
// CLIEntropyThreshold returns the entropy threshold.
|
|
func CLIEntropyThreshold() float64 {
|
|
return _entropyThreshold
|
|
}
|
|
|
|
// CLIKeepShorter returns whether to keep short fragments.
|
|
func CLIKeepShorter() bool {
|
|
return _keepShorter
|
|
}
|
|
|
|
// ==============================
// Match-specific options
// ==============================

// _indexDirectory backs --index / -i; it is empty until the option is parsed.
var _indexDirectory string
|
|
|
|
// IndexDirectoryOptionSet registers --index / -i (mandatory directory for match).
|
|
func IndexDirectoryOptionSet(options *getoptions.GetOpt) {
|
|
options.StringVar(&_indexDirectory, "index", _indexDirectory,
|
|
options.Alias("i"),
|
|
options.Required(),
|
|
options.ArgName("DIRECTORY"),
|
|
options.Description("Path to the kmer index directory."))
|
|
}
|
|
|
|
// CLIIndexDirectory returns the --index directory path.
|
|
func CLIIndexDirectory() string {
|
|
return _indexDirectory
|
|
}
|
|
|
|
// CLIIndexEntropyThreshold returns the entropy filter threshold for index building (0 = disabled).
|
|
func CLIIndexEntropyThreshold() float64 {
|
|
return _indexEntropyThreshold
|
|
}
|
|
|
|
// CLIIndexEntropySize returns the entropy filter word size for index building.
|
|
func CLIIndexEntropySize() int {
|
|
return _indexEntropySize
|
|
}
|
|
|
|
// OutputFormatOptionSet registers --json-output, --csv-output, --yaml-output.
|
|
func OutputFormatOptionSet(options *getoptions.GetOpt) {
|
|
options.BoolVar(&_jsonOutput, "json-output", false,
|
|
options.Description("Print results as JSON."))
|
|
options.BoolVar(&_csvOutput, "csv-output", false,
|
|
options.Description("Print results as CSV."))
|
|
options.BoolVar(&_yamlOutput, "yaml-output", false,
|
|
options.Description("Print results as YAML."))
|
|
}
|
|
|
|
// CLIOutFormat returns the selected output format: "json", "csv", "yaml", or "text".
|
|
func CLIOutFormat() string {
|
|
if _jsonOutput {
|
|
return "json"
|
|
}
|
|
if _csvOutput {
|
|
return "csv"
|
|
}
|
|
if _yamlOutput {
|
|
return "yaml"
|
|
}
|
|
return "text"
|
|
}
|
|
|
|
// SetSelectionOptionSet registers --set <glob_pattern> (repeatable).
|
|
func SetSelectionOptionSet(options *getoptions.GetOpt) {
|
|
options.StringSliceVar(&_setPatterns, "set", 1, 1,
|
|
options.Alias("s"),
|
|
options.ArgName("PATTERN"),
|
|
options.Description("Set ID or glob pattern (repeatable, supports *, ?, [...])."))
|
|
}
|
|
|
|
// CLISetPatterns returns the --set patterns provided by the user.
|
|
func CLISetPatterns() []string {
|
|
return _setPatterns
|
|
}
|
|
|
|
// ForceOptionSet registers --force / -f.
|
|
func ForceOptionSet(options *getoptions.GetOpt) {
|
|
options.BoolVar(&_force, "force", false,
|
|
options.Alias("f"),
|
|
options.Description("Force operation even if set ID already exists in destination."))
|
|
}
|
|
|
|
// CLIForce returns whether --force was specified.
|
|
func CLIForce() bool {
|
|
return _force
|
|
}
|