Refactor k-mer index building to use disk-based KmerSetGroupBuilder

Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update CLI to use new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce KDI (K-mer Delta Index) file format for efficient storage
- Add K-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect new API

This refactoring lowers memory usage, speeds up operations on large datasets, and allows more flexible k-mer set operations.
This commit is contained in:
Eric Coissac
2026-02-09 21:57:03 +01:00
parent a016ad5b8a
commit f78543ee75
33 changed files with 3291 additions and 3636 deletions

67
pkg/obikmer/skm_reader.go Normal file
View File

@@ -0,0 +1,67 @@
package obikmer
import (
"bufio"
"encoding/binary"
"io"
"os"
)
// decode2bit maps a 2-bit nucleotide code back to its lowercase ASCII base.
// The code is used directly as the array index: 0 -> 'a', 1 -> 'c',
// 2 -> 'g', 3 -> 't'.
var decode2bit = [4]byte{'a', 'c', 'g', 't'}
// SkmReader reads super-kmers sequentially from a binary .skm file.
//
// As decoded by Next, each record is a 2-byte little-endian nucleotide count
// followed by the nucleotides packed four per byte (2 bits each, first
// nucleotide in the two most significant bits).
type SkmReader struct {
// r is the buffered reader wrapping file; record reads go through it.
r *bufio.Reader
// file is the open .skm file handle, kept so that Close can release it.
file *os.File
}
// NewSkmReader opens the .skm file at path and returns a reader positioned at
// its first record. Reads are buffered through a 64 KiB bufio.Reader.
//
// If the file cannot be opened, the error from os.Open is returned unchanged
// and the reader is nil.
func NewSkmReader(path string) (*SkmReader, error) {
	handle, openErr := os.Open(path)
	if openErr != nil {
		return nil, openErr
	}

	reader := new(SkmReader)
	reader.file = handle
	reader.r = bufio.NewReaderSize(handle, 64*1024)
	return reader, nil
}
// Next decodes the next super-kmer record from the file.
//
// It returns the decoded SuperKmer and true on success. It returns a zero
// SuperKmer and false when no complete record could be read: at end of file,
// but also on any other read error or on a truncated record, since the
// signature has no way to report the underlying error.
//
// The returned Sequence is a freshly allocated slice of lowercase nucleotide
// bytes; Start is 0 and End is the sequence length.
func (sr *SkmReader) Next() (SuperKmer, bool) {
	// Record header: nucleotide count as a little-endian uint16.
	var header [2]byte
	if _, err := io.ReadFull(sr.r, header[:]); err != nil {
		return SuperKmer{}, false
	}
	count := int(binary.LittleEndian.Uint16(header[:]))

	// Payload: four nucleotides per byte, so round the byte count up.
	payload := make([]byte, (count+3)/4)
	if _, err := io.ReadFull(sr.r, payload); err != nil {
		return SuperKmer{}, false
	}

	// Unpack 2-bit codes; within a byte the first nucleotide occupies the
	// two most significant bits.
	bases := make([]byte, count)
	for pos := range bases {
		shift := uint(3-(pos&3)) * 2
		bases[pos] = decode2bit[(payload[pos>>2]>>shift)&0x03]
	}

	return SuperKmer{Sequence: bases, Start: 0, End: count}, true
}
// Close closes the underlying .skm file and returns whatever error the
// file's Close method reports.
func (sr *SkmReader) Close() error {
return sr.file.Close()
}