mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-26 05:50:52 +00:00
Add max frequency filtering and top-kmer saving capabilities
This commit introduces max frequency filtering to limit k-mer occurrences and adds functionality to save the N most frequent k-mers per set to CSV files. It also includes the ability to output k-mer frequency spectra as CSV and updates the CLI options accordingly.
This commit is contained in:
@@ -31,11 +31,19 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
||||
return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
|
||||
}
|
||||
|
||||
maxOcc := CLIMaxOccurrence()
|
||||
|
||||
// Build options
|
||||
var opts []obikmer.BuilderOption
|
||||
if minOcc > 1 {
|
||||
opts = append(opts, obikmer.WithMinFrequency(minOcc))
|
||||
}
|
||||
if maxOcc > 0 {
|
||||
opts = append(opts, obikmer.WithMaxFrequency(maxOcc))
|
||||
}
|
||||
if topN := CLISaveFreqKmer(); topN > 0 {
|
||||
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
|
||||
}
|
||||
|
||||
// Determine whether to append to existing group or create new
|
||||
var builder *obikmer.KmerSetGroupBuilder
|
||||
@@ -50,7 +58,11 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
||||
}
|
||||
} else {
|
||||
// New group
|
||||
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
|
||||
if maxOcc > 0 {
|
||||
log.Infof("Creating new kmer index: k=%d, m=%d, occurrence=[%d,%d]", k, m, minOcc, maxOcc)
|
||||
} else {
|
||||
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
|
||||
}
|
||||
builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create kmer index builder: %w", err)
|
||||
@@ -99,6 +111,9 @@ func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error
|
||||
if minOcc > 1 {
|
||||
ksg.SetAttribute("min_occurrence", minOcc)
|
||||
}
|
||||
if maxOcc > 0 {
|
||||
ksg.SetAttribute("max_occurrence", maxOcc)
|
||||
}
|
||||
|
||||
if err := ksg.SaveMetadata(); err != nil {
|
||||
return fmt.Errorf("failed to save metadata: %w", err)
|
||||
|
||||
@@ -48,6 +48,12 @@ func OptionSet(opt *getoptions.GetOpt) {
|
||||
SetSelectionOptionSet(rmCmd)
|
||||
rmCmd.SetCommandFn(runRm)
|
||||
|
||||
// spectrum: output k-mer frequency spectrum as CSV
|
||||
spectrumCmd := opt.NewCommand("spectrum", "Output k-mer frequency spectrum as CSV")
|
||||
SetSelectionOptionSet(spectrumCmd)
|
||||
obiconvert.OutputModeOptionSet(spectrumCmd, false)
|
||||
spectrumCmd.SetCommandFn(runSpectrum)
|
||||
|
||||
// super: extract super k-mers from sequences
|
||||
superCmd := opt.NewCommand("super", "Extract super k-mers from sequence files")
|
||||
obiconvert.InputOptionSet(superCmd)
|
||||
|
||||
@@ -38,7 +38,9 @@ var _indexId = ""
|
||||
var _metadataFormat = "toml"
|
||||
var _setTag = make(map[string]string, 0)
|
||||
var _minOccurrence = 1
|
||||
var _maxOccurrence = 0
|
||||
var _saveFullFilter = false
|
||||
var _saveFreqKmer = 0
|
||||
|
||||
// KmerIndexOptionSet defines every option related to kmer index building.
|
||||
func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||
@@ -64,8 +66,14 @@ func KmerIndexOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_minOccurrence, "min-occurrence", _minOccurrence,
|
||||
options.Description("Minimum number of occurrences for a k-mer to be kept (default 1 = keep all)."))
|
||||
|
||||
options.IntVar(&_maxOccurrence, "max-occurrence", _maxOccurrence,
|
||||
options.Description("Maximum number of occurrences for a k-mer to be kept (default 0 = no upper bound)."))
|
||||
|
||||
options.BoolVar(&_saveFullFilter, "save-full-filter", _saveFullFilter,
|
||||
options.Description("When using --min-occurrence > 1, save the full frequency filter instead of just the filtered index."))
|
||||
|
||||
options.IntVar(&_saveFreqKmer, "save-freq-kmer", _saveFreqKmer,
|
||||
options.Description("Save the N most frequent k-mers per set to a CSV file (top_kmers.csv)."))
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the k-mer size.
|
||||
@@ -114,11 +122,21 @@ func CLIMinOccurrence() int {
|
||||
return _minOccurrence
|
||||
}
|
||||
|
||||
// CLIMaxOccurrence returns the maximum occurrence threshold (0 = no upper bound).
|
||||
func CLIMaxOccurrence() int {
|
||||
return _maxOccurrence
|
||||
}
|
||||
|
||||
// CLISaveFullFilter returns whether to save the full frequency filter.
|
||||
func CLISaveFullFilter() bool {
|
||||
return _saveFullFilter
|
||||
}
|
||||
|
||||
// CLISaveFreqKmer returns the number of top frequent k-mers to save (0 = disabled).
|
||||
func CLISaveFreqKmer() int {
|
||||
return _saveFreqKmer
|
||||
}
|
||||
|
||||
// CLIOutputDirectory returns the output directory path.
|
||||
func CLIOutputDirectory() string {
|
||||
return obiconvert.CLIOutPutFileName()
|
||||
|
||||
121
pkg/obitools/obik/spectrum.go
Normal file
121
pkg/obitools/obik/spectrum.go
Normal file
@@ -0,0 +1,121 @@
|
||||
package obik
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// runSpectrum implements the "obik spectrum" subcommand.
|
||||
// It outputs k-mer frequency spectra as CSV with one column per set.
|
||||
func runSpectrum(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
|
||||
if len(args) < 1 {
|
||||
return fmt.Errorf("usage: obik spectrum [options] <index_directory>")
|
||||
}
|
||||
|
||||
ksg, err := obikmer.OpenKmerSetGroup(args[0])
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to open kmer index: %w", err)
|
||||
}
|
||||
|
||||
// Determine which sets to include
|
||||
patterns := CLISetPatterns()
|
||||
var indices []int
|
||||
if len(patterns) > 0 {
|
||||
indices, err = ksg.MatchSetIDs(patterns)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to match set patterns: %w", err)
|
||||
}
|
||||
if len(indices) == 0 {
|
||||
return fmt.Errorf("no sets match the given patterns")
|
||||
}
|
||||
} else {
|
||||
// All sets
|
||||
indices = make([]int, ksg.Size())
|
||||
for i := range indices {
|
||||
indices[i] = i
|
||||
}
|
||||
}
|
||||
|
||||
// Read spectra for selected sets
|
||||
spectraMaps := make([]map[int]uint64, len(indices))
|
||||
maxFreq := 0
|
||||
for i, idx := range indices {
|
||||
spectrum, err := ksg.Spectrum(idx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read spectrum for set %d: %w", idx, err)
|
||||
}
|
||||
if spectrum == nil {
|
||||
log.Warnf("No spectrum data for set %d (%s)", idx, ksg.SetIDOf(idx))
|
||||
spectraMaps[i] = make(map[int]uint64)
|
||||
continue
|
||||
}
|
||||
spectraMaps[i] = spectrum.ToMap()
|
||||
if mf := spectrum.MaxFrequency(); mf > maxFreq {
|
||||
maxFreq = mf
|
||||
}
|
||||
}
|
||||
|
||||
if maxFreq == 0 {
|
||||
return fmt.Errorf("no spectrum data found in any selected set")
|
||||
}
|
||||
|
||||
// Determine output destination
|
||||
outFile := obiconvert.CLIOutPutFileName()
|
||||
var w *csv.Writer
|
||||
if outFile == "" || outFile == "-" {
|
||||
w = csv.NewWriter(os.Stdout)
|
||||
} else {
|
||||
f, err := os.Create(outFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create output file: %w", err)
|
||||
}
|
||||
defer f.Close()
|
||||
w = csv.NewWriter(f)
|
||||
}
|
||||
defer w.Flush()
|
||||
|
||||
// Build header: frequency, set_id_1, set_id_2, ...
|
||||
header := make([]string, 1+len(indices))
|
||||
header[0] = "frequency"
|
||||
for i, idx := range indices {
|
||||
id := ksg.SetIDOf(idx)
|
||||
if id == "" {
|
||||
id = fmt.Sprintf("set_%d", idx)
|
||||
}
|
||||
header[i+1] = id
|
||||
}
|
||||
if err := w.Write(header); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Write rows for each frequency from 1 to maxFreq
|
||||
record := make([]string, 1+len(indices))
|
||||
for freq := 1; freq <= maxFreq; freq++ {
|
||||
record[0] = strconv.Itoa(freq)
|
||||
hasData := false
|
||||
for i := range indices {
|
||||
count := spectraMaps[i][freq]
|
||||
record[i+1] = strconv.FormatUint(count, 10)
|
||||
if count > 0 {
|
||||
hasData = true
|
||||
}
|
||||
}
|
||||
// Only write rows where at least one set has a non-zero count
|
||||
if hasData {
|
||||
if err := w.Write(record); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user