Add sparse index support for KDI files with fast seeking

This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include:

- New KdxIndex struct and related functions for loading, searching, and writing .kdx files
- Modified KdiReader to support seeking with the new index
- Updated KdiWriter to create .kdx index files during writing
- Enhanced KmerSetGroup.Contains to use the new index for faster lookups
- Added a new 'match' command to annotate sequences with k-mer match positions

The index is created automatically during KDI file creation. A lookup performs an O(log(N / stride)) binary search over the sparse index entries, followed by a linear scan of at most `stride` k-mers, which significantly improves performance for large datasets.
This commit is contained in:
Eric Coissac
2026-02-10 13:23:56 +01:00
parent 9babcc0fae
commit c6e04265f1
7 changed files with 642 additions and 30 deletions

123
pkg/obitools/obik/match.go Normal file
View File

@@ -0,0 +1,123 @@
package obik
import (
"context"
"fmt"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/DavidGamba/go-getoptions"
)
// matchSliceWorker builds a SeqSliceWorker that tags every sequence of a
// batch with the k-mer match positions found in the selected sets of ksg.
//
// For each set index in setIndices, every sequence with at least one match
// receives an attribute named "kmer_matched_<setID>" holding the positions
// MatchBatch reported for that sequence; sequences without matches get no
// attribute for that set. A set whose ID is empty is named "set_<index>".
// The worker hands back the same batch it received and never returns an
// error.
//
// NOTE(review): positions are presumably the sorted, 0-based start offsets
// of the matched k-mers — confirm against KmerSetGroup.MatchBatch.
func matchSliceWorker(ksg *obikmer.KmerSetGroup, setIndices []int) obiseq.SeqSliceWorker {
	annotate := func(batch obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
		// Nothing to annotate: skip query preparation entirely.
		if len(batch) < 1 {
			return batch, nil
		}

		// PrepareQueries takes a plain []*BioSequence, so copy the pointers over.
		seqs := make([]*obiseq.BioSequence, 0, len(batch))
		for _, s := range batch {
			seqs = append(seqs, s)
		}

		// The prepared queries are computed once and reused for every set.
		queries := ksg.PrepareQueries(seqs)

		for _, setIdx := range setIndices {
			hits := ksg.MatchBatch(setIdx, queries)

			// Fall back to a positional name when the set has no ID.
			label := ksg.SetIDOf(setIdx)
			if label == "" {
				label = fmt.Sprintf("set_%d", setIdx)
			}
			key := "kmer_matched_" + label

			for i, pos := range hits {
				if len(pos) == 0 {
					continue
				}
				batch[i].SetAttribute(key, pos)
			}
		}

		return batch, nil
	}

	return annotate
}
// runMatch is the entry point of the "obik match" subcommand.
//
// args[0] is the directory of the k-mer index; any remaining arguments are
// passed to obiconvert.CLIReadBioSequences as sequence inputs. The sets to
// match against are those selected by CLISetPatterns, or every set of the
// index when no pattern is given. Sequences are then annotated by the worker
// from matchSliceWorker and written with obiconvert.CLIWriteBioSequences.
//
// An error is returned when the index directory argument is missing, the
// index cannot be opened, the patterns cannot be resolved or select no set,
// or the sequence inputs cannot be opened. ctx and opt are not used.
func runMatch(ctx context.Context, opt *getoptions.GetOpt, args []string) error {
	if len(args) == 0 {
		return fmt.Errorf("usage: obik match [options] <index_directory> [sequence_files...]")
	}
	indexDir, seqArgs := args[0], args[1:]

	// Load the on-disk k-mer index.
	ksg, err := obikmer.OpenKmerSetGroup(indexDir)
	if err != nil {
		return fmt.Errorf("failed to open kmer index: %w", err)
	}
	log.Infof("Opened index: k=%d, m=%d, %d partitions, %d set(s)",
		ksg.K(), ksg.M(), ksg.Partitions(), ksg.Size())

	// Work out which sets the sequences are matched against.
	var setIndices []int
	if patterns := CLISetPatterns(); len(patterns) == 0 {
		// No pattern given: select every set, in index order.
		setIndices = make([]int, ksg.Size())
		for i := range setIndices {
			setIndices[i] = i
		}
	} else {
		setIndices, err = ksg.MatchSetIDs(patterns)
		if err != nil {
			return fmt.Errorf("failed to match set patterns: %w", err)
		}
		if len(setIndices) == 0 {
			return fmt.Errorf("no sets match the given patterns")
		}
	}

	// Report the selected sets before starting the pipeline.
	for _, setIdx := range setIndices {
		label := ksg.SetIDOf(setIdx)
		if label == "" {
			label = fmt.Sprintf("set_%d", setIdx)
		}
		log.Infof("Matching against set %d (%s): %d k-mers", setIdx, label, ksg.Len(setIdx))
	}

	// Open the input sequence stream.
	sequences, err := obiconvert.CLIReadBioSequences(seqArgs...)
	if err != nil {
		return fmt.Errorf("failed to open sequence files: %w", err)
	}

	// Annotate batches with the match worker, then write the result.
	matched := sequences.MakeISliceWorker(
		matchSliceWorker(ksg, setIndices),
		false,
		obidefault.ParallelWorkers(),
	)
	obiconvert.CLIWriteBioSequences(matched, true)

	// Block until the pipeline has drained before returning.
	obiutils.WaitForLastPipe()

	return nil
}