Files
obitools4/pkg/obitools/obik/index.go
Eric Coissac f2937af1ad Add max frequency filtering and top-kmer saving capabilities
This commit introduces max frequency filtering to limit k-mer occurrences and adds functionality to save the N most frequent k-mers per set to CSV files. It also includes the ability to output k-mer frequency spectra as CSV and updates the CLI options accordingly.
2026-02-10 09:27:04 +01:00

126 lines
3.2 KiB
Go

package obik
import (
"context"
"fmt"
"os"
"path/filepath"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
func runIndex(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
outDir := CLIOutputDirectory()
if outDir == "" || outDir == "-" {
return fmt.Errorf("--out option is required and must specify a directory path")
}
k := CLIKmerSize()
if k < 2 || k > 31 {
return fmt.Errorf("invalid k-mer size: %d (must be between 2 and 31)", k)
}
m := CLIMinimizerSize()
minOcc := CLIMinOccurrence()
if minOcc < 1 {
return fmt.Errorf("invalid min-occurrence: %d (must be >= 1)", minOcc)
}
maxOcc := CLIMaxOccurrence()
// Build options
var opts []obikmer.BuilderOption
if minOcc > 1 {
opts = append(opts, obikmer.WithMinFrequency(minOcc))
}
if maxOcc > 0 {
opts = append(opts, obikmer.WithMaxFrequency(maxOcc))
}
if topN := CLISaveFreqKmer(); topN > 0 {
opts = append(opts, obikmer.WithSaveFreqKmers(topN))
}
// Determine whether to append to existing group or create new
var builder *obikmer.KmerSetGroupBuilder
var err error
metaPath := filepath.Join(outDir, "metadata.toml")
if _, statErr := os.Stat(metaPath); statErr == nil {
// Existing group: append
log.Infof("Appending to existing kmer index at %s", outDir)
builder, err = obikmer.AppendKmerSetGroupBuilder(outDir, 1, opts...)
if err != nil {
return fmt.Errorf("failed to open existing kmer index for appending: %w", err)
}
} else {
// New group
if maxOcc > 0 {
log.Infof("Creating new kmer index: k=%d, m=%d, occurrence=[%d,%d]", k, m, minOcc, maxOcc)
} else {
log.Infof("Creating new kmer index: k=%d, m=%d, min-occurrence=%d", k, m, minOcc)
}
builder, err = obikmer.NewKmerSetGroupBuilder(outDir, k, m, 1, -1, opts...)
if err != nil {
return fmt.Errorf("failed to create kmer index builder: %w", err)
}
}
// Read and process sequences
sequences, err := obiconvert.CLIReadBioSequences(args...)
if err != nil {
return fmt.Errorf("failed to open sequence files: %w", err)
}
seqCount := 0
for sequences.Next() {
batch := sequences.Get()
for _, seq := range batch.Slice() {
builder.AddSequence(0, seq)
seqCount++
}
}
log.Infof("Processed %d sequences", seqCount)
// Finalize
ksg, err := builder.Close()
if err != nil {
return fmt.Errorf("failed to finalize kmer index: %w", err)
}
// Apply index-id to the new set
newSetIdx := builder.StartIndex()
if id := CLIIndexId(); id != "" {
ksg.SetSetID(newSetIdx, id)
}
// Apply group-level tags (-S)
for key, value := range CLISetTag() {
ksg.SetAttribute(key, value)
}
// Apply per-set tags (-T) to the new set
for key, value := range _setMetaTags {
ksg.SetSetMetadata(newSetIdx, key, value)
}
if minOcc > 1 {
ksg.SetAttribute("min_occurrence", minOcc)
}
if maxOcc > 0 {
ksg.SetAttribute("max_occurrence", maxOcc)
}
if err := ksg.SaveMetadata(); err != nil {
return fmt.Errorf("failed to save metadata: %w", err)
}
log.Infof("Index contains %d k-mers for set %d in %s", ksg.Len(newSetIdx), newSetIdx, outDir)
log.Info("Done.")
return nil
}