Add sparse index support for KDI files with fast seeking

This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include:

- New KdxIndex struct and related functions for loading, searching, and writing .kdx files
- Modified KdiReader to support seeking with the new index
- Updated KdiWriter to create .kdx index files during writing
- Enhanced KmerSetGroup.Contains to use the new index for faster lookups
- Added a new 'match' command to annotate sequences with k-mer match positions

The index is created automatically during KDI file creation. A lookup is an O(log(N / stride)) binary search over the index entries, followed by a linear scan of at most stride k-mers, which significantly improves performance for large datasets.
This commit is contained in:
Eric Coissac
2026-02-10 13:23:56 +01:00
parent 9babcc0fae
commit c6e04265f1
7 changed files with 642 additions and 30 deletions

170
pkg/obikmer/kdx.go Normal file
View File

@@ -0,0 +1,170 @@
package obikmer
import (
"encoding/binary"
"fmt"
"io"
"os"
"sort"
"strings"
)
// kdxMagic is the 4-byte signature expected at the very start of a .kdx
// file: the ASCII letters 'K', 'D', 'X' followed by the byte 0x01.
var kdxMagic = [...]byte{'K', 'D', 'X', 0x01}
// defaultKdxStride is the default spacing of the sparse index: the number of
// k-mers that separate two consecutive index entries (4096).
const defaultKdxStride = 1 << 12
// kdxEntry is one record of the sparse index. It pairs a k-mer with the
// location, in the corresponding .kdi file, at which that k-mer is stored.
type kdxEntry struct {
	// kmer is the absolute k-mer value.
	kmer uint64
	// offset is the absolute byte offset in the .kdi file.
	offset uint64
}
// KdxIndex is the in-memory form of a sparse index over a .kdi file.
// One entry is kept every `stride` k-mers, so locating a k-mer among N costs
// a binary search over the entries, O(log(N/stride)), followed by a linear
// scan of at most `stride` k-mers.
type KdxIndex struct {
	stride  int        // number of k-mers between consecutive entries
	entries []kdxEntry // index entries; binary-searched by k-mer value
}
// LoadKdxIndex reads the .kdx file at path into memory.
//
// File layout (all integers little-endian):
//
//	magic    [4]byte  "KDX\x01"
//	stride   uint32   number of k-mers between consecutive entries
//	count    uint32   number of entries that follow
//	entries  count × (kmer uint64, offset uint64)
//
// It returns (nil, nil) when the file does not exist, so callers can degrade
// gracefully to an un-indexed scan. Any other failure — an I/O error, a wrong
// magic, or a header that is inconsistent with the rest of the file — yields
// a nil index and a non-nil error.
func LoadKdxIndex(path string) (*KdxIndex, error) {
	const (
		headerSize = 12   // magic + stride + count
		entrySize  = 16   // kmer + offset
		batchLen   = 4096 // entries decoded per read call (64 KiB buffer)
	)

	f, err := os.Open(path)
	if err != nil {
		if os.IsNotExist(err) {
			return nil, nil
		}
		return nil, err
	}
	defer f.Close()

	// Magic.
	var magic [4]byte
	if _, err := io.ReadFull(f, magic[:]); err != nil {
		return nil, fmt.Errorf("kdx: read magic: %w", err)
	}
	if magic != kdxMagic {
		return nil, fmt.Errorf("kdx: bad magic %v", magic)
	}

	// Stride (uint32 LE).
	var word [4]byte
	if _, err := io.ReadFull(f, word[:]); err != nil {
		return nil, fmt.Errorf("kdx: read stride: %w", err)
	}
	stride := int(binary.LittleEndian.Uint32(word[:]))

	// Count (uint32 LE).
	if _, err := io.ReadFull(f, word[:]); err != nil {
		return nil, fmt.Errorf("kdx: read count: %w", err)
	}
	declared := binary.LittleEndian.Uint32(word[:])

	// With a zero stride every entry would map to k-mer count 0, so
	// FindOffset would hand out offsets with a wrong skip count.
	if stride == 0 && declared > 0 {
		return nil, fmt.Errorf("kdx: invalid stride 0 for an index with %d entries", declared)
	}

	// The count comes from the file and cannot be trusted: a damaged header
	// could otherwise request a multi-gigabyte allocation before the first
	// read fails. For regular files, check it against what is actually there.
	if fi, err := f.Stat(); err == nil && fi.Mode().IsRegular() {
		if avail := (fi.Size() - headerSize) / entrySize; int64(declared) > avail {
			return nil, fmt.Errorf("kdx: header declares %d entries but file holds only %d: %w",
				declared, avail, io.ErrUnexpectedEOF)
		}
	}
	count := int(declared)

	// Decode the entries in batches rather than issuing one 16-byte read
	// (i.e. one syscall) per entry.
	entries := make([]kdxEntry, count)
	bufLen := batchLen
	if count < bufLen {
		bufLen = count
	}
	buf := make([]byte, bufLen*entrySize)
	for done := 0; done < count; {
		n := count - done
		if n > batchLen {
			n = batchLen
		}
		raw := buf[:n*entrySize]
		if got, err := io.ReadFull(f, raw); err != nil {
			// got/entrySize complete entries of this batch were read, so the
			// first entry that could not be read is done+got/entrySize.
			return nil, fmt.Errorf("kdx: read entry %d: %w", done+got/entrySize, err)
		}
		for j := range entries[done : done+n] {
			rec := raw[j*entrySize:]
			entries[done+j] = kdxEntry{
				kmer:   binary.LittleEndian.Uint64(rec[0:8]),
				offset: binary.LittleEndian.Uint64(rec[8:16]),
			}
		}
		done += n
	}

	return &KdxIndex{
		stride:  stride,
		entries: entries,
	}, nil
}
// FindOffset picks the position in the .kdi file from which a scan for
// target should start.
//
// The index holds one entry per `stride` k-mers: entry i describes the k-mer
// written at count (i+1)*stride. FindOffset selects the last entry whose
// k-mer is <= target and returns:
//   - offset: the byte offset stored in that entry (positioned after the
//     indexed k-mer, ready to read the next delta);
//   - skipCount: how many k-mers are already consumed at that offset, to be
//     used as the reader's internal counter;
//   - ok: whether the index supplies a useful starting point.
//
// ok is false — and the caller should scan from the beginning — when the
// receiver is nil, the index is empty, or target is smaller than the first
// indexed k-mer.
func (idx *KdxIndex) FindOffset(target uint64) (offset uint64, skipCount uint64, ok bool) {
	if idx == nil {
		return 0, 0, false
	}

	// upper is the number of entries whose k-mer is <= target, i.e. the
	// position of the first entry that is strictly greater than target.
	upper := sort.Search(len(idx.entries), func(j int) bool {
		return target < idx.entries[j].kmer
	})
	if upper == 0 {
		// Empty index, or target precedes every indexed k-mer.
		return 0, 0, false
	}

	// The chosen entry has position upper-1 and was recorded at k-mer count
	// ((upper-1)+1)*stride.
	hit := idx.entries[upper-1]
	return hit.offset, uint64(upper) * uint64(idx.stride), true
}
// Stride returns the stride of this index, i.e. the number of k-mers between
// consecutive index entries.
//
// Like FindOffset, it is safe to call on a nil *KdxIndex (the value
// LoadKdxIndex returns when no .kdx file exists); it then returns 0.
func (idx *KdxIndex) Stride() int {
	if idx == nil {
		return 0
	}
	return idx.stride
}
// Len returns the number of entries in this index.
//
// Like FindOffset, it is safe to call on a nil *KdxIndex (the value
// LoadKdxIndex returns when no .kdx file exists); it then returns 0.
func (idx *KdxIndex) Len() int {
	if idx == nil {
		return 0
	}
	return len(idx.entries)
}
// WriteKdxIndex writes a .kdx file at path from a stride and a slice of
// entries, creating the file or truncating an existing one.
//
// File layout (all integers little-endian):
//
//	magic    [4]byte  "KDX\x01"
//	stride   uint32
//	count    uint32   len(entries)
//	entries  count × (kmer uint64, offset uint64)
//
// The arguments are validated before anything is written: an error is
// returned, and the file system is left untouched, when stride is negative
// or does not fit in a uint32, when len(entries) does not fit in a uint32,
// or when stride is 0 while entries is non-empty (such an index could not be
// used for seeking).
//
// The file image is assembled in memory and handed to os.WriteFile, which
// issues the write in one call and also reports a failure to close the file.
func WriteKdxIndex(path string, stride int, entries []kdxEntry) error {
	const (
		headerSize = 12 // magic + stride + count
		entrySize  = 16 // kmer + offset
		maxUint32  = 1<<32 - 1
	)

	// Refuse values that the uint32 header fields would silently truncate.
	if stride < 0 || uint64(stride) > maxUint32 {
		return fmt.Errorf("kdx: stride %d does not fit in uint32", stride)
	}
	if uint64(len(entries)) > maxUint32 {
		return fmt.Errorf("kdx: %d entries do not fit in uint32", len(entries))
	}
	if stride == 0 && len(entries) > 0 {
		return fmt.Errorf("kdx: invalid stride 0 for an index with %d entries", len(entries))
	}

	img := make([]byte, headerSize+entrySize*len(entries))
	copy(img, kdxMagic[:])
	binary.LittleEndian.PutUint32(img[4:8], uint32(stride))
	binary.LittleEndian.PutUint32(img[8:12], uint32(len(entries)))

	rec := img[headerSize:]
	for _, e := range entries {
		binary.LittleEndian.PutUint64(rec[0:8], e.kmer)
		binary.LittleEndian.PutUint64(rec[8:16], e.offset)
		rec = rec[entrySize:]
	}

	// 0666 (before umask) is the mode os.Create uses.
	return os.WriteFile(path, img, 0666)
}
// KdxPathForKdi derives the path of the .kdx index file that belongs to the
// given .kdi file: a trailing ".kdi" is removed (once) if present, and ".kdx"
// is appended.
func KdxPathForKdi(kdiPath string) string {
	const kdiExt, kdxExt = ".kdi", ".kdx"

	base := kdiPath
	if strings.HasSuffix(base, kdiExt) {
		base = base[:len(base)-len(kdiExt)]
	}
	return base + kdxExt
}