Files
obitools4/pkg/obitools/obik/obik.go
Eric Coissac c6e04265f1 Add sparse index support for KDI files with fast seeking
This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include:

- New KdxIndex struct and related functions for loading, searching, and writing .kdx files
- Modified KdiReader to support seeking with the new index
- Updated KdiWriter to create .kdx index files during writing
- Enhanced KmerSetGroup.Contains to use the new index for faster lookups
- Added a new 'match' command to annotate sequences with k-mer match positions

The index is created automatically during KDI file creation. A lookup performs an O(log(N / stride)) binary search over the sparse index, followed by a linear scan of at most `stride` entries, which significantly improves performance for large datasets.
2026-02-10 13:24:24 +01:00

78 lines
2.8 KiB
Go

package obik
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// OptionSet declares every obik subcommand on the root option parser opt.
//
// Each subcommand is created with opt.NewCommand, receives its options, and
// is then bound to its run* handler. Declaration order is: index, ls,
// summary, cp, mv, rm, spectrum, super, lowmask, match.
func OptionSet(opt *getoptions.GetOpt) {
	// declare applies the same three steps to every subcommand: create it,
	// let configure attach its options, then bind its handler.
	declare := func(name, help string, configure func(cmd *getoptions.GetOpt), handler getoptions.CommandFn) {
		cmd := opt.NewCommand(name, help)
		configure(cmd)
		cmd.SetCommandFn(handler)
	}

	// index — kmer index construction from sequence files.
	declare("index", "Build a disk-based kmer index from sequence files",
		func(cmd *getoptions.GetOpt) {
			obiconvert.InputOptionSet(cmd)
			obiconvert.OutputModeOptionSet(cmd, false)
			KmerIndexOptionSet(cmd)
			// --tag / -T stores its KEY=VALUE arguments in _setMetaTags.
			cmd.StringMapVar(&_setMetaTags, "tag", 1, 1,
				cmd.Alias("T"),
				cmd.ArgName("KEY=VALUE"),
				cmd.Description("Per-set metadata tag (repeatable)."))
		},
		runIndex)

	// ls — listing of the sets held by an index.
	declare("ls", "List sets in a kmer index",
		func(cmd *getoptions.GetOpt) {
			OutputFormatOptionSet(cmd)
			SetSelectionOptionSet(cmd)
		},
		runLs)

	// summary — index statistics, with an optional Jaccard matrix.
	declare("summary", "Show detailed statistics of a kmer index",
		func(cmd *getoptions.GetOpt) {
			OutputFormatOptionSet(cmd)
			cmd.BoolVar(&_jaccard, "jaccard", false,
				cmd.Description("Compute and display pairwise Jaccard distance matrix."))
		},
		runSummary)

	// cp — copy of sets from one index to another.
	declare("cp", "Copy sets between kmer indices",
		func(cmd *getoptions.GetOpt) {
			SetSelectionOptionSet(cmd)
			ForceOptionSet(cmd)
		},
		runCp)

	// mv — move of sets from one index to another.
	declare("mv", "Move sets between kmer indices",
		func(cmd *getoptions.GetOpt) {
			SetSelectionOptionSet(cmd)
			ForceOptionSet(cmd)
		},
		runMv)

	// rm — removal of sets from an index.
	declare("rm", "Remove sets from a kmer index",
		func(cmd *getoptions.GetOpt) {
			SetSelectionOptionSet(cmd)
		},
		runRm)

	// spectrum — k-mer frequency spectrum written as CSV.
	declare("spectrum", "Output k-mer frequency spectrum as CSV",
		func(cmd *getoptions.GetOpt) {
			SetSelectionOptionSet(cmd)
			obiconvert.OutputModeOptionSet(cmd, false)
		},
		runSpectrum)

	// super — super k-mer extraction from sequence files.
	declare("super", "Extract super k-mers from sequence files",
		func(cmd *getoptions.GetOpt) {
			obiconvert.InputOptionSet(cmd)
			obiconvert.OutputOptionSet(cmd)
			SuperKmerOptionSet(cmd)
		},
		runSuper)

	// lowmask — masking of low-complexity regions.
	declare("lowmask", "Mask low-complexity regions in sequences using entropy",
		func(cmd *getoptions.GetOpt) {
			obiconvert.InputOptionSet(cmd)
			obiconvert.OutputOptionSet(cmd)
			LowMaskOptionSet(cmd)
		},
		runLowmask)

	// match — annotation of sequences with k-mer match positions.
	declare("match", "Annotate sequences with k-mer match positions from an index",
		func(cmd *getoptions.GetOpt) {
			obiconvert.InputOptionSet(cmd)
			obiconvert.OutputOptionSet(cmd)
			SetSelectionOptionSet(cmd)
		},
		runMatch)
}