mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Add sparse index support for KDI files with fast seeking
This commit introduces sparse index support for KDI files to enable fast random access during k-mer matching. It adds a new .kdx index file format and updates the KDI reader and writer to handle index creation and seeking. The changes include: - New KdxIndex struct and related functions for loading, searching, and writing .kdx files - Modified KdiReader to support seeking with the new index - Updated KdiWriter to create .kdx index files during writing - Enhanced KmerSetGroup.Contains to use the new index for faster lookups - Added a new 'match' command to annotate sequences with k-mer match positions The index is created automatically during KDI file creation and allows for O(log N / stride) binary search followed by at most stride linear scan steps, significantly improving performance for large datasets.
This commit is contained in:
190
pkg/obikmer/kmer_match.go
Normal file
190
pkg/obikmer/kmer_match.go
Normal file
@@ -0,0 +1,190 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
)
|
||||
|
||||
// QueryEntry represents a canonical k-mer to look up, together with
|
||||
// metadata to trace the result back to the originating sequence and position.
|
||||
type QueryEntry struct {
|
||||
Kmer uint64 // canonical k-mer value
|
||||
SeqIdx int // index within the batch
|
||||
Pos int // 0-based position in the sequence
|
||||
}
|
||||
|
||||
// MatchResult maps sequence index → sorted slice of matched positions.
|
||||
type MatchResult map[int][]int
|
||||
|
||||
// seqMatchResult collects matched positions for a single sequence.
|
||||
type seqMatchResult struct {
|
||||
mu sync.Mutex
|
||||
positions []int
|
||||
}
|
||||
|
||||
// PrepareQueries extracts all canonical k-mers from a batch of sequences
|
||||
// and groups them by partition using super-kmer minimizers.
|
||||
//
|
||||
// Returns queries[partition] where each slice is sorted by Kmer value.
|
||||
func (ksg *KmerSetGroup) PrepareQueries(sequences []*obiseq.BioSequence) [][]QueryEntry {
|
||||
P := ksg.partitions
|
||||
k := ksg.k
|
||||
m := ksg.m
|
||||
|
||||
// Pre-allocate partition buckets
|
||||
buckets := make([][]QueryEntry, P)
|
||||
for i := range buckets {
|
||||
buckets[i] = make([]QueryEntry, 0, 64)
|
||||
}
|
||||
|
||||
for seqIdx, seq := range sequences {
|
||||
bseq := seq.Sequence()
|
||||
if len(bseq) < k {
|
||||
continue
|
||||
}
|
||||
|
||||
// Iterate super-kmers to get minimizer → partition mapping
|
||||
for sk := range IterSuperKmers(bseq, k, m) {
|
||||
partition := int(sk.Minimizer % uint64(P))
|
||||
|
||||
// Iterate canonical k-mers within this super-kmer
|
||||
skSeq := sk.Sequence
|
||||
if len(skSeq) < k {
|
||||
continue
|
||||
}
|
||||
|
||||
localPos := 0
|
||||
for kmer := range IterCanonicalKmers(skSeq, k) {
|
||||
buckets[partition] = append(buckets[partition], QueryEntry{
|
||||
Kmer: kmer,
|
||||
SeqIdx: seqIdx,
|
||||
Pos: sk.Start + localPos,
|
||||
})
|
||||
localPos++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sort each bucket by k-mer value for merge-scan
|
||||
for p := range buckets {
|
||||
sort.Slice(buckets[p], func(i, j int) bool {
|
||||
return buckets[p][i].Kmer < buckets[p][j].Kmer
|
||||
})
|
||||
}
|
||||
|
||||
return buckets
|
||||
}
|
||||
|
||||
// MatchBatch looks up pre-sorted queries against one set of the index.
|
||||
// Partitions are processed in parallel. For each partition, a merge-scan
|
||||
// compares the sorted queries against the sorted KDI stream.
|
||||
//
|
||||
// Returns a MatchResult mapping sequence index to sorted matched positions.
|
||||
func (ksg *KmerSetGroup) MatchBatch(setIndex int, queries [][]QueryEntry) MatchResult {
|
||||
P := ksg.partitions
|
||||
|
||||
// Per-sequence result collectors
|
||||
var resultMu sync.Mutex
|
||||
resultMap := make(map[int]*seqMatchResult)
|
||||
|
||||
getResult := func(seqIdx int) *seqMatchResult {
|
||||
resultMu.Lock()
|
||||
sr, ok := resultMap[seqIdx]
|
||||
if !ok {
|
||||
sr = &seqMatchResult{}
|
||||
resultMap[seqIdx] = sr
|
||||
}
|
||||
resultMu.Unlock()
|
||||
return sr
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for p := 0; p < P; p++ {
|
||||
if len(queries[p]) == 0 {
|
||||
continue
|
||||
}
|
||||
wg.Add(1)
|
||||
go func(part int) {
|
||||
defer wg.Done()
|
||||
ksg.matchPartition(setIndex, part, queries[part], getResult)
|
||||
}(p)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
// Build final result with sorted positions
|
||||
result := make(MatchResult, len(resultMap))
|
||||
for seqIdx, sr := range resultMap {
|
||||
if len(sr.positions) > 0 {
|
||||
sort.Ints(sr.positions)
|
||||
result[seqIdx] = sr.positions
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// matchPartition processes one partition: opens the KDI reader (with index),
|
||||
// seeks to the first query, then merge-scans queries against the KDI stream.
|
||||
func (ksg *KmerSetGroup) matchPartition(
|
||||
setIndex int,
|
||||
partIndex int,
|
||||
queries []QueryEntry, // sorted by Kmer
|
||||
getResult func(int) *seqMatchResult,
|
||||
) {
|
||||
r, err := NewKdiIndexedReader(ksg.partitionPath(setIndex, partIndex))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if r.Count() == 0 || len(queries) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Seek to the first query's neighborhood
|
||||
if err := r.SeekTo(queries[0].Kmer); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Read first kmer from the stream after seek
|
||||
currentKmer, ok := r.Next()
|
||||
if !ok {
|
||||
return
|
||||
}
|
||||
|
||||
qi := 0 // query index
|
||||
|
||||
for qi < len(queries) {
|
||||
q := queries[qi]
|
||||
|
||||
// Advance KDI stream until >= query kmer
|
||||
for currentKmer < q.Kmer {
|
||||
currentKmer, ok = r.Next()
|
||||
if !ok {
|
||||
return // KDI exhausted
|
||||
}
|
||||
}
|
||||
|
||||
if currentKmer == q.Kmer {
|
||||
// Match! Record all queries with this same k-mer value
|
||||
matchedKmer := q.Kmer
|
||||
for qi < len(queries) && queries[qi].Kmer == matchedKmer {
|
||||
sr := getResult(queries[qi].SeqIdx)
|
||||
sr.mu.Lock()
|
||||
sr.positions = append(sr.positions, queries[qi].Pos)
|
||||
sr.mu.Unlock()
|
||||
qi++
|
||||
}
|
||||
} else {
|
||||
// currentKmer > q.Kmer: skip all queries with this kmer value
|
||||
skippedKmer := q.Kmer
|
||||
for qi < len(queries) && queries[qi].Kmer == skippedKmer {
|
||||
qi++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user