Add max frequency filtering and top-kmer saving capabilities

This commit introduces max frequency filtering to limit k-mer occurrences and adds functionality to save the N most frequent k-mers per set to CSV files. It also includes the ability to output k-mer frequency spectra as CSV and updates the CLI options accordingly.
This commit is contained in:
Eric Coissac
2026-02-10 09:26:46 +01:00
parent 56c1f4180c
commit f2937af1ad
7 changed files with 530 additions and 12 deletions

View File

@@ -31,11 +31,19 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
}
maxOcc := CLIMaxOccurrence()
// Build options
var opts []obikmer.BuilderOption
if minOcc > 1 {
opts = append(opts, obikmer.WithMinFrequency(minOcc))
}
if maxOcc > 0 {
opts = append(opts, obikmer.WithMaxFrequency(maxOcc))
}
if topN := CLISaveFreqKmer(); topN > 0 {
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
}
// Determine whether to append to existing group or create new
var builder *obikmer.KmerSetGroupBuilder
@@ -50,7 +58,11 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
}
} else {
// New group
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
if maxOcc > 0 {
log.Infof("Creating new kmer index: k=%d, m=%d, occurrence=[%d,%d]", k, m, minOcc, maxOcc)
} else {
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
}
builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
if err != nil {
return fmt.Errorf("failed to create kmer index builder: %w", err)
@@ -99,6 +111,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
if minOcc > 1 {
ksg.SetAttribute("min_occurrence", minOcc)
}
if maxOcc > 0 {
ksg.SetAttribute("max_occurrence", maxOcc)
}
if err := ksg.SaveMetadata(); err != nil {
return fmt.Errorf("failed to save metadata: %w", err)

View File

@@ -48,6 +48,12 @@ func OptionSet(opt *getoptions.GetOpt) {
SetSelectionOptionSet(rmCmd)
rmCmd.SetCommandFn(runRm)
// spectrum: output k-mer frequency spectrum as CSV
spectrumCmd := opt.NewCommand("spectrum", "Output k-mer frequency spectrum as CSV")
SetSelectionOptionSet(spectrumCmd)
obiconvert.OutputModeOptionSet(spectrumCmd, false)
spectrumCmd.SetCommandFn(runSpectrum)
// super: extract super k-mers from sequences
superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files")
obiconvert.InputOptionSet(superCmd)

View File

@@ -38,7 +38,9 @@ var _indexId = ""
var _metadataFormat = "toml"
var _setTag = make(map[string]string, 0)
var _minOccurrence = 1
var _maxOccurrence = 0
var _saveFullFilter = false
var _saveFreqKmer = 0
// KmerIndexOptionSet defines every option related to kmer index building.
func KmerIndexOptionSet(options *getoptions.GetOpt) {
@@ -64,8 +66,14 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
options.IntVar(&_maxOccurrence, "max-occurrence", _maxOccurrence,
options.Description("Maximum number of occurrences for a k-mer to be kept (default 0 = no upper bound)."))
options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
}
// CLIKmerSize returns the k-mer size.
@@ -114,11 +122,21 @@ func CLIMinOccurrence() int {
return _minOccurrence
}
// CLIMaxOccurrence returns the maximum occurrence threshold set with the
// --max-occurrence option. The default value, 0, means no upper bound.
func CLIMaxOccurrence() int {
return _maxOccurrence
}
// CLISaveFullFilter reports whether the --save-full-filter option was given,
// i.e. whether the full frequency filter should be saved instead of just the
// filtered index. The default is false.
func CLISaveFullFilter() bool {
return _saveFullFilter
}
// CLISaveFreqKmer returns the value of the --save-freq-kmer option: the number
// N of most frequent k-mers per set to save to a CSV file. The default value,
// 0, means the feature is disabled.
func CLISaveFreqKmer() int {
return _saveFreqKmer
}
// CLIOutputDirectory returns the output directory path.
func CLIOutputDirectory() string {
return obiconvert.CLIOutPutFileName()

View File

@@ -0,0 +1,121 @@
package obik
import (
"context"
"encoding/csv"
"fmt"
"os"
"strconv"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// runSpectrum implements the "obik spectrum" subcommand.
//
// It opens the k-mer index directory given as args[0], selects the sets
// matching CLISetPatterns() (or every set when no pattern is given), and
// writes their frequency spectra as CSV: a "frequency" column followed by one
// column per selected set. Frequencies for which every selected set has a
// zero count are omitted. Output goes to the file named by
// obiconvert.CLIOutPutFileName(), or to standard output when that name is
// empty or "-".
//
// An error is returned when the index directory argument is missing, the
// index cannot be opened, no set matches the patterns, no selected set holds
// spectrum data, or the CSV cannot be completely written and closed.
func runSpectrum(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) < 1 {
		return fmt.Errorf("usage: obik spectrum [options] <index_directory>")
	}

	ksg, err := obikmer.OpenKmerSetGroup(args[0])
	if err != nil {
		return fmt.Errorf("failed to open kmer index: %w", err)
	}

	// Determine which sets to include.
	patterns := CLISetPatterns()
	var indices []int
	if len(patterns) > 0 {
		indices, err = ksg.MatchSetIDs(patterns)
		if err != nil {
			return fmt.Errorf("failed to match set patterns: %w", err)
		}
		if len(indices) == 0 {
			return fmt.Errorf("no sets match the given patterns")
		}
	} else {
		// No pattern: select all sets.
		indices = make([]int, ksg.Size())
		for i := range indices {
			indices[i] = i
		}
	}

	// Read spectra for the selected sets and track the highest frequency seen,
	// which bounds the rows to emit.
	spectraMaps := make([]map[int]uint64, len(indices))
	maxFreq := 0
	for i, idx := range indices {
		spectrum, err := ksg.Spectrum(idx)
		if err != nil {
			return fmt.Errorf("failed to read spectrum for set %d: %w", idx, err)
		}
		if spectrum == nil {
			// Keep an empty column for this set so the header and the rows
			// stay aligned.
			log.Warnf("No spectrum data for set %d (%s)", idx, ksg.SetIDOf(idx))
			spectraMaps[i] = make(map[int]uint64)
			continue
		}
		spectraMaps[i] = spectrum.ToMap()
		if mf := spectrum.MaxFrequency(); mf > maxFreq {
			maxFreq = mf
		}
	}

	if maxFreq == 0 {
		return fmt.Errorf("no spectrum data found in any selected set")
	}

	// Build header: frequency, set_id_1, set_id_2, ...
	header := make([]string, 1+len(indices))
	header[0] = "frequency"
	for i, idx := range indices {
		id := ksg.SetIDOf(idx)
		if id == "" {
			id = fmt.Sprintf("set_%d", idx)
		}
		header[i+1] = id
	}

	// Standard output is never closed here; only a file we created is.
	outFile := obiconvert.CLIOutPutFileName()
	if outFile == "" || outFile == "-" {
		return writeSpectrumCSV(csv.NewWriter(os.Stdout), header, spectraMaps, maxFreq)
	}

	f, err := os.Create(outFile)
	if err != nil {
		return fmt.Errorf("failed to create output file: %w", err)
	}
	if err := writeSpectrumCSV(csv.NewWriter(f), header, spectraMaps, maxFreq); err != nil {
		// Best-effort close: the write error is the one worth reporting.
		_ = f.Close()
		return err
	}
	// Close explicitly (not deferred) so that a failure to persist the data
	// is reported to the caller instead of being silently dropped.
	if err := f.Close(); err != nil {
		return fmt.Errorf("failed to close output file: %w", err)
	}
	return nil
}

// writeSpectrumCSV writes header followed by one row per frequency in
// [1, maxFreq] for which at least one spectrum has a non-zero count, then
// flushes w. Any error buffered by the writer, including one raised by the
// final flush, is returned.
func writeSpectrumCSV(w *csv.Writer, header []string, spectra []map[int]uint64, maxFreq int) error {
	if err := w.Write(header); err != nil {
		return fmt.Errorf("failed to write spectrum header: %w", err)
	}

	// The record slice is reused across rows; csv.Writer copies the fields
	// into its own buffer on each Write.
	record := make([]string, 1+len(spectra))
	for freq := 1; freq <= maxFreq; freq++ {
		record[0] = strconv.Itoa(freq)
		hasData := false
		for i, counts := range spectra {
			count := counts[freq]
			record[i+1] = strconv.FormatUint(count, 10)
			if count > 0 {
				hasData = true
			}
		}
		// Only write rows where at least one set has a non-zero count.
		if !hasData {
			continue
		}
		if err := w.Write(record); err != nil {
			return fmt.Errorf("failed to write spectrum row for frequency %d: %w", freq, err)
		}
	}

	// csv.Writer buffers its output: errors from the underlying writer are
	// only guaranteed to surface through Error() after Flush().
	w.Flush()
	if err := w.Error(); err != nil {
		return fmt.Errorf("failed to write spectrum: %w", err)
	}
	return nil
}