mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include:
- New KdxIndex struct and related functions for loading, searching, and writing .kdx files
- Modified KdiReader to support seeking with the new index
- Updated KdiWriter to create .kdx index files during writing
- Enhanced KmerSetGroup.Contains to use the new index for faster lookups
- Added a new 'match' command to annotate sequences with k-mer match positions
The index is created automatically during KDI file creation. A lookup is an O(log(N / stride)) binary search over the index entries followed by at most stride linear scan steps in the KDI file, significantly improving performance for large datasets.
171 lines
4.6 KiB
Go
171 lines
4.6 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"encoding/binary"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// kdxMagic is the four-byte signature found at the start of every .kdx file:
// the ASCII letters "KDX" followed by the byte 0x01.
var kdxMagic = [...]byte{'K', 'D', 'X', 0x01}
|
|
|
|
// defaultKdxStride is the default spacing of the sparse index, expressed as
// the number of k-mers between two consecutive index entries (4096).
const defaultKdxStride = 1 << 12
|
|
|
|
// kdxEntry is one record of the sparse index. It pairs the absolute value of
// an indexed k-mer with the position associated with that k-mer in the
// companion .kdi file.
type kdxEntry struct {
	// kmer is the absolute (not delta-encoded) k-mer value.
	kmer uint64
	// offset is an absolute byte offset into the .kdi file.
	offset uint64
}
|
|
|
|
// KdxIndex is a sparse, in-memory index for a .kdi file.
|
|
// It stores one entry every `stride` k-mers, enabling O(log N / stride)
|
|
// binary search followed by at most `stride` linear scan steps.
|
|
type KdxIndex struct {
|
|
stride int
|
|
entries []kdxEntry
|
|
}
|
|
|
|
// LoadKdxIndex reads a .kdx file into memory.
|
|
// Returns (nil, nil) if the file does not exist (graceful degradation).
|
|
func LoadKdxIndex(path string) (*KdxIndex, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return nil, nil
|
|
}
|
|
return nil, err
|
|
}
|
|
defer f.Close()
|
|
|
|
// Read magic
|
|
var magic [4]byte
|
|
if _, err := io.ReadFull(f, magic[:]); err != nil {
|
|
return nil, fmt.Errorf("kdx: read magic: %w", err)
|
|
}
|
|
if magic != kdxMagic {
|
|
return nil, fmt.Errorf("kdx: bad magic %v", magic)
|
|
}
|
|
|
|
// Read stride (uint32 LE)
|
|
var buf4 [4]byte
|
|
if _, err := io.ReadFull(f, buf4[:]); err != nil {
|
|
return nil, fmt.Errorf("kdx: read stride: %w", err)
|
|
}
|
|
stride := int(binary.LittleEndian.Uint32(buf4[:]))
|
|
|
|
// Read count (uint32 LE)
|
|
if _, err := io.ReadFull(f, buf4[:]); err != nil {
|
|
return nil, fmt.Errorf("kdx: read count: %w", err)
|
|
}
|
|
count := int(binary.LittleEndian.Uint32(buf4[:]))
|
|
|
|
// Read entries
|
|
entries := make([]kdxEntry, count)
|
|
var buf16 [16]byte
|
|
for i := 0; i < count; i++ {
|
|
if _, err := io.ReadFull(f, buf16[:]); err != nil {
|
|
return nil, fmt.Errorf("kdx: read entry %d: %w", i, err)
|
|
}
|
|
entries[i] = kdxEntry{
|
|
kmer: binary.LittleEndian.Uint64(buf16[0:8]),
|
|
offset: binary.LittleEndian.Uint64(buf16[8:16]),
|
|
}
|
|
}
|
|
|
|
return &KdxIndex{
|
|
stride: stride,
|
|
entries: entries,
|
|
}, nil
|
|
}
|
|
|
|
// FindOffset locates the best starting point in the .kdi file to scan for
|
|
// the target k-mer. It returns:
|
|
// - offset: the byte offset in the .kdi file to seek to (positioned after
|
|
// the indexed k-mer, ready to read the next delta)
|
|
// - skipCount: the number of k-mers already consumed at that offset
|
|
// (to set the reader's internal counter)
|
|
// - ok: true if the index provides a useful starting point
|
|
//
|
|
// Index entries are recorded at k-mer count positions stride, 2*stride, etc.
|
|
// Entry i corresponds to the k-mer written at count = (i+1)*stride.
|
|
func (idx *KdxIndex) FindOffset(target uint64) (offset uint64, skipCount uint64, ok bool) {
|
|
if idx == nil || len(idx.entries) == 0 {
|
|
return 0, 0, false
|
|
}
|
|
|
|
// Binary search: find the largest entry with kmer <= target
|
|
i := sort.Search(len(idx.entries), func(i int) bool {
|
|
return idx.entries[i].kmer > target
|
|
})
|
|
// i is the first entry with kmer > target, so i-1 is the last with kmer <= target
|
|
if i == 0 {
|
|
// Target is before the first index entry.
|
|
// No useful jump point — caller should scan from the beginning.
|
|
return 0, 0, false
|
|
}
|
|
|
|
i-- // largest entry with kmer <= target
|
|
// Entry i was recorded after writing k-mer at count = (i+1)*stride
|
|
skipCount = uint64(i+1) * uint64(idx.stride)
|
|
return idx.entries[i].offset, skipCount, true
|
|
}
|
|
|
|
// Stride returns the stride of this index.
|
|
func (idx *KdxIndex) Stride() int {
|
|
return idx.stride
|
|
}
|
|
|
|
// Len returns the number of entries in this index.
|
|
func (idx *KdxIndex) Len() int {
|
|
return len(idx.entries)
|
|
}
|
|
|
|
// WriteKdxIndex writes a .kdx file from a slice of entries.
|
|
func WriteKdxIndex(path string, stride int, entries []kdxEntry) error {
|
|
f, err := os.Create(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer f.Close()
|
|
|
|
// Magic
|
|
if _, err := f.Write(kdxMagic[:]); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Stride (uint32 LE)
|
|
var buf4 [4]byte
|
|
binary.LittleEndian.PutUint32(buf4[:], uint32(stride))
|
|
if _, err := f.Write(buf4[:]); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Count (uint32 LE)
|
|
binary.LittleEndian.PutUint32(buf4[:], uint32(len(entries)))
|
|
if _, err := f.Write(buf4[:]); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Entries
|
|
var buf16 [16]byte
|
|
for _, e := range entries {
|
|
binary.LittleEndian.PutUint64(buf16[0:8], e.kmer)
|
|
binary.LittleEndian.PutUint64(buf16[8:16], e.offset)
|
|
if _, err := f.Write(buf16[:]); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// KdxPathForKdi derives the path of the .kdx index file that accompanies the
// given .kdi file: a trailing ".kdi" is replaced by ".kdx". If kdiPath does
// not end in ".kdi", ".kdx" is simply appended.
func KdxPathForKdi(kdiPath string) string {
	const kdiExt = ".kdi"

	stem := kdiPath
	if strings.HasSuffix(stem, kdiExt) {
		stem = stem[:len(stem)-len(kdiExt)]
	}
	return stem + ".kdx"
}
|