Add sparse index support for KDI files with fast seeking

This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include:

- New KdxIndex struct and related functions for loading, searching, and writing .kdx files
- Modified KdiReader to support seeking with the new index
- Updated KdiWriter to create .kdx index files during writing
- Enhanced KmerSetGroup.Contains to use the new index for faster lookups
- Added a new 'match' command to annotate sequences with k-mer match positions

The index is created automatically during KDI file creation. A lookup performs an O(log(N / stride)) binary search over the sparse index entries, followed by a linear scan of at most `stride` entries, which significantly improves performance for large datasets.
This commit is contained in:
Eric Coissac
2026-02-10 13:23:56 +01:00
parent 9babcc0fae
commit c6e04265f1
7 changed files with 642 additions and 30 deletions

View File

@@ -219,21 +219,15 @@ func (ksg *KmerSetGroup) Len(setIndex ...int) uint64 {
}
// Contains checks if a k-mer is present in the specified set.
// Uses binary search on the appropriate partition's KDI file.
// Uses the .kdx sparse index (if available) for fast seeking within
// each partition, then a short linear scan of at most `stride` entries.
// All partitions are searched in parallel since the k-mer's partition
// is not known without its minimizer context.
func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool {
if setIndex < 0 || setIndex >= ksg.n {
return false
}
// Determine partition from minimizer
// For a canonical k-mer, we need to find which partition it would fall into.
// The partition is determined by the minimizer during construction.
// For Contains, we must scan all partitions of this set (linear search within each).
// A full binary-search approach would require an index file.
// For now, scan the partition determined by the k-mer's minimizer.
// Since we don't know the minimizer, we do a linear scan of all partitions.
// This is O(total_kmers / P) per partition on average.
// Optimization: scan all partitions in parallel
type result struct {
found bool
}
@@ -241,12 +235,20 @@ func (ksg *KmerSetGroup) Contains(setIndex int, kmer uint64) bool {
for p := 0; p < ksg.partitions; p++ {
go func(part int) {
r, err := NewKdiReader(ksg.partitionPath(setIndex, part))
r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, part))
if err != nil {
ch <- result{false}
return
}
defer r.Close()
// Use index to jump near the target
if err := r.SeekTo(kmer); err != nil {
ch <- result{false}
return
}
// Linear scan from the seek position
for {
v, ok := r.Next()
if !ok {
@@ -853,13 +855,21 @@ func (ksg *KmerSetGroup) CopySetsByIDTo(ids []string, destDir string, force bool
return nil, fmt.Errorf("obikmer: create dest set dir: %w", err)
}
// Copy all partition files
// Copy all partition files and their .kdx indices
for p := 0; p < ksg.partitions; p++ {
srcPath := ksg.partitionPath(srcIdx, p)
destPath := dest.partitionPath(destIdx, p)
if err := copyFile(srcPath, destPath); err != nil {
return nil, fmt.Errorf("obikmer: copy partition %d of set %q: %w", p, srcID, err)
}
// Copy .kdx index if it exists
srcKdx := KdxPathForKdi(srcPath)
if _, err := os.Stat(srcKdx); err == nil {
destKdx := KdxPathForKdi(destPath)
if err := copyFile(srcKdx, destKdx); err != nil {
return nil, fmt.Errorf("obikmer: copy index %d of set %q: %w", p, srcID, err)
}
}
}
// Copy spectrum.bin if it exists