mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add sparse index support for KDI files with fast seeking
This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include:
- New KdxIndex struct and related functions for loading, searching, and writing .kdx files
- Modified KdiReader to support seeking with the new index
- Updated KdiWriter to create .kdx index files during writing
- Enhanced KmerSetGroup.Contains to use the new index for faster lookups
- Added a new 'match' command to annotate sequences with k-mer match positions

The index is created automatically during KDI file creation. A lookup performs an O(log(N / stride)) binary search over the index entries, followed by at most stride linear scan steps in the KDI file, significantly improving performance for large datasets.
This commit is contained in:
170
pkg/obikmer/kdx.go
Normal file
170
pkg/obikmer/kdx.go
Normal file
@@ -0,0 +1,170 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// kdxMagic is the 4-byte signature that opens every .kdx file: the ASCII
// letters "KDX" followed by the byte 0x01.
var kdxMagic = [4]byte{'K', 'D', 'X', 0x01}
|
||||
|
||||
// defaultKdxStride is the number of k-mers between consecutive index entries.
const defaultKdxStride = 4096
|
||||
|
||||
// kdxEntry is a single entry in the sparse index: the absolute k-mer value
// and the byte offset in the corresponding .kdi file where that k-mer is stored.
type kdxEntry struct {
	kmer   uint64 // absolute k-mer value
	offset uint64 // absolute byte offset in .kdi file
}
|
||||
|
||||
// KdxIndex is a sparse, in-memory index for a .kdi file.
|
||||
// It stores one entry every `stride` k-mers, enabling O(log N / stride)
|
||||
// binary search followed by at most `stride` linear scan steps.
|
||||
type KdxIndex struct {
|
||||
stride int
|
||||
entries []kdxEntry
|
||||
}
|
||||
|
||||
// LoadKdxIndex reads a .kdx file into memory.
|
||||
// Returns (nil, nil) if the file does not exist (graceful degradation).
|
||||
func LoadKdxIndex(path string) (*KdxIndex, error) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Read magic
|
||||
var magic [4]byte
|
||||
if _, err := io.ReadFull(f, magic[:]); err != nil {
|
||||
return nil, fmt.Errorf("kdx: read magic: %w", err)
|
||||
}
|
||||
if magic != kdxMagic {
|
||||
return nil, fmt.Errorf("kdx: bad magic %v", magic)
|
||||
}
|
||||
|
||||
// Read stride (uint32 LE)
|
||||
var buf4 [4]byte
|
||||
if _, err := io.ReadFull(f, buf4[:]); err != nil {
|
||||
return nil, fmt.Errorf("kdx: read stride: %w", err)
|
||||
}
|
||||
stride := int(binary.LittleEndian.Uint32(buf4[:]))
|
||||
|
||||
// Read count (uint32 LE)
|
||||
if _, err := io.ReadFull(f, buf4[:]); err != nil {
|
||||
return nil, fmt.Errorf("kdx: read count: %w", err)
|
||||
}
|
||||
count := int(binary.LittleEndian.Uint32(buf4[:]))
|
||||
|
||||
// Read entries
|
||||
entries := make([]kdxEntry, count)
|
||||
var buf16 [16]byte
|
||||
for i := 0; i < count; i++ {
|
||||
if _, err := io.ReadFull(f, buf16[:]); err != nil {
|
||||
return nil, fmt.Errorf("kdx: read entry %d: %w", i, err)
|
||||
}
|
||||
entries[i] = kdxEntry{
|
||||
kmer: binary.LittleEndian.Uint64(buf16[0:8]),
|
||||
offset: binary.LittleEndian.Uint64(buf16[8:16]),
|
||||
}
|
||||
}
|
||||
|
||||
return &KdxIndex{
|
||||
stride: stride,
|
||||
entries: entries,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// FindOffset locates the best starting point in the .kdi file to scan for
|
||||
// the target k-mer. It returns:
|
||||
// - offset: the byte offset in the .kdi file to seek to (positioned after
|
||||
// the indexed k-mer, ready to read the next delta)
|
||||
// - skipCount: the number of k-mers already consumed at that offset
|
||||
// (to set the reader's internal counter)
|
||||
// - ok: true if the index provides a useful starting point
|
||||
//
|
||||
// Index entries are recorded at k-mer count positions stride, 2*stride, etc.
|
||||
// Entry i corresponds to the k-mer written at count = (i+1)*stride.
|
||||
func (idx *KdxIndex) FindOffset(target uint64) (offset uint64, skipCount uint64, ok bool) {
|
||||
if idx == nil || len(idx.entries) == 0 {
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
// Binary search: find the largest entry with kmer <= target
|
||||
i := sort.Search(len(idx.entries), func(i int) bool {
|
||||
return idx.entries[i].kmer > target
|
||||
})
|
||||
// i is the first entry with kmer > target, so i-1 is the last with kmer <= target
|
||||
if i == 0 {
|
||||
// Target is before the first index entry.
|
||||
// No useful jump point — caller should scan from the beginning.
|
||||
return 0, 0, false
|
||||
}
|
||||
|
||||
i-- // largest entry with kmer <= target
|
||||
// Entry i was recorded after writing k-mer at count = (i+1)*stride
|
||||
skipCount = uint64(i+1) * uint64(idx.stride)
|
||||
return idx.entries[i].offset, skipCount, true
|
||||
}
|
||||
|
||||
// Stride returns the stride of this index.
|
||||
func (idx *KdxIndex) Stride() int {
|
||||
return idx.stride
|
||||
}
|
||||
|
||||
// Len returns the number of entries in this index.
|
||||
func (idx *KdxIndex) Len() int {
|
||||
return len(idx.entries)
|
||||
}
|
||||
|
||||
// WriteKdxIndex writes a .kdx file from a slice of entries.
|
||||
func WriteKdxIndex(path string, stride int, entries []kdxEntry) error {
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
// Magic
|
||||
if _, err := f.Write(kdxMagic[:]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Stride (uint32 LE)
|
||||
var buf4 [4]byte
|
||||
binary.LittleEndian.PutUint32(buf4[:], uint32(stride))
|
||||
if _, err := f.Write(buf4[:]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Count (uint32 LE)
|
||||
binary.LittleEndian.PutUint32(buf4[:], uint32(len(entries)))
|
||||
if _, err := f.Write(buf4[:]); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Entries
|
||||
var buf16 [16]byte
|
||||
for _, e := range entries {
|
||||
binary.LittleEndian.PutUint64(buf16[0:8], e.kmer)
|
||||
binary.LittleEndian.PutUint64(buf16[8:16], e.offset)
|
||||
if _, err := f.Write(buf16[:]); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// KdxPathForKdi returns the .kdx path corresponding to a .kdi path.
// A trailing ".kdi" is replaced by ".kdx"; a path without that suffix simply
// gets ".kdx" appended.
func KdxPathForKdi(kdiPath string) string {
	return strings.TrimSuffix(kdiPath, ".kdi") + ".kdx"
}
|
||||
Reference in New Issue
Block a user