mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include: - New KdxIndex struct and related functions for loading, searching, and writing .kdx files - Modified KdiReader to support seeking with the new index - Updated KdiWriter to create .kdx index files during writing - Enhanced KmerSetGroup.Contains to use the new index for faster lookups - Added a new 'match' command to annotate sequences with k-mer match positions The index is created automatically during KDI file creation and allows for an O(log(N / stride)) binary search over the index entries followed by a linear scan of at most stride k-mers, significantly improving performance for large datasets.
152 lines
3.7 KiB
Go
152 lines
3.7 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"bufio"
|
|
"encoding/binary"
|
|
"os"
|
|
)
|
|
|
|
// kdiMagic is the 4-byte signature placed at the very start of every
// .kdi file: the ASCII letters "KDI" followed by the byte 0x01.
var kdiMagic = [...]byte{'K', 'D', 'I', '\x01'}
|
|
|
|
// kdiHeaderSize is the byte length of the fixed KDI header:
// 4 bytes of magic followed by an 8-byte k-mer count, 12 bytes in total.
const kdiHeaderSize = 4 + 8
|
|
|
|
// KdiWriter writes a sorted sequence of uint64 k-mers to a .kdi file
|
|
// using delta-varint encoding.
|
|
//
|
|
// Format:
|
|
//
|
|
// [magic: 4 bytes "KDI\x01"]
|
|
// [count: uint64 LE] number of k-mers
|
|
// [first: uint64 LE] first k-mer (absolute value)
|
|
// [delta_1: varint] arr[1] - arr[0]
|
|
// [delta_2: varint] arr[2] - arr[1]
|
|
// ...
|
|
//
|
|
// The caller must write k-mers in strictly increasing order.
|
|
//
|
|
// On Close(), a companion .kdx sparse index file is written alongside
|
|
// the .kdi file for fast random access.
|
|
type KdiWriter struct {
|
|
w *bufio.Writer
|
|
file *os.File
|
|
count uint64
|
|
prev uint64
|
|
first bool
|
|
path string
|
|
bytesWritten uint64 // bytes written after header (data section offset)
|
|
indexEntries []kdxEntry // sparse index entries collected during writes
|
|
}
|
|
|
|
// NewKdiWriter creates a new KdiWriter writing to the given file path.
|
|
// The header (magic + count placeholder) is written immediately.
|
|
// Count is patched on Close().
|
|
func NewKdiWriter(path string) (*KdiWriter, error) {
|
|
f, err := os.Create(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
w := bufio.NewWriterSize(f, 65536)
|
|
|
|
// Write magic
|
|
if _, err := w.Write(kdiMagic[:]); err != nil {
|
|
f.Close()
|
|
return nil, err
|
|
}
|
|
// Write placeholder for count (will be patched on Close)
|
|
var countBuf [8]byte
|
|
if _, err := w.Write(countBuf[:]); err != nil {
|
|
f.Close()
|
|
return nil, err
|
|
}
|
|
|
|
return &KdiWriter{
|
|
w: w,
|
|
file: f,
|
|
first: true,
|
|
path: path,
|
|
bytesWritten: 0,
|
|
indexEntries: make([]kdxEntry, 0, 256),
|
|
}, nil
|
|
}
|
|
|
|
// Write adds a k-mer to the file. K-mers must be written in strictly
|
|
// increasing order.
|
|
func (kw *KdiWriter) Write(kmer uint64) error {
|
|
if kw.first {
|
|
// Write first value as absolute uint64 LE
|
|
var buf [8]byte
|
|
binary.LittleEndian.PutUint64(buf[:], kmer)
|
|
if _, err := kw.w.Write(buf[:]); err != nil {
|
|
return err
|
|
}
|
|
kw.bytesWritten += 8
|
|
kw.prev = kmer
|
|
kw.first = false
|
|
} else {
|
|
delta := kmer - kw.prev
|
|
n, err := EncodeVarint(kw.w, delta)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
kw.bytesWritten += uint64(n)
|
|
kw.prev = kmer
|
|
}
|
|
kw.count++
|
|
|
|
// Record sparse index entry every defaultKdxStride k-mers.
|
|
// The offset recorded is AFTER writing this k-mer, so it points to
|
|
// where the next k-mer's data will start. SeekTo uses this: it seeks
|
|
// to the recorded offset, sets prev = indexedKmer, and Next() reads
|
|
// the delta of the following k-mer.
|
|
if kw.count%defaultKdxStride == 0 {
|
|
kw.indexEntries = append(kw.indexEntries, kdxEntry{
|
|
kmer: kmer,
|
|
offset: kdiHeaderSize + kw.bytesWritten,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Count returns the number of k-mers written so far.
|
|
func (kw *KdiWriter) Count() uint64 {
|
|
return kw.count
|
|
}
|
|
|
|
// Close flushes buffered data, patches the count in the header,
|
|
// writes the companion .kdx index file, and closes the file.
|
|
func (kw *KdiWriter) Close() error {
|
|
if err := kw.w.Flush(); err != nil {
|
|
kw.file.Close()
|
|
return err
|
|
}
|
|
|
|
// Patch count at offset 4 (after magic)
|
|
if _, err := kw.file.Seek(4, 0); err != nil {
|
|
kw.file.Close()
|
|
return err
|
|
}
|
|
var countBuf [8]byte
|
|
binary.LittleEndian.PutUint64(countBuf[:], kw.count)
|
|
if _, err := kw.file.Write(countBuf[:]); err != nil {
|
|
kw.file.Close()
|
|
return err
|
|
}
|
|
|
|
if err := kw.file.Close(); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Write .kdx index file if there are entries to index
|
|
if len(kw.indexEntries) > 0 {
|
|
kdxPath := KdxPathForKdi(kw.path)
|
|
if err := WriteKdxIndex(kdxPath, defaultKdxStride, kw.indexEntries); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|