Refactor k-mer index building to use disk-based KmerSetGroupBuilder

Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update CLI to use new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce KDI (K-mer Delta Index) file format for efficient storage
- Add K-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect new API

This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
Eric Coissac
2026-02-09 21:57:03 +01:00
parent a016ad5b8a
commit f78543ee75
33 changed files with 3291 additions and 3636 deletions

255
pkg/obikmer/kdi_test.go Normal file
View File

@@ -0,0 +1,255 @@
package obikmer
import (
"os"
"path/filepath"
"sort"
"testing"
)
// TestKdiRoundTrip writes an increasing list of k-mer values to a KDI file and
// checks that reading the file back yields the same values, in the same order,
// followed by end-of-stream. It also checks the element count reported by both
// the writer and the reader.
func TestKdiRoundTrip(t *testing.T) {
	kdiPath := filepath.Join(t.TempDir(), "test.kdi")

	// Increasing values, ranging from small numbers up to 2^62-1.
	want := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1}

	writer, err := NewKdiWriter(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	for idx := range want {
		if err := writer.Write(want[idx]); err != nil {
			t.Fatal(err)
		}
	}
	// The count is checked before Close, right after the last Write.
	if n := writer.Count(); n != uint64(len(want)) {
		t.Fatalf("writer count: got %d, want %d", n, len(want))
	}
	if err := writer.Close(); err != nil {
		t.Fatal(err)
	}

	// Re-open the file and stream the values back.
	reader, err := NewKdiReader(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	defer reader.Close()

	if n := reader.Count(); n != uint64(len(want)) {
		t.Fatalf("reader count: got %d, want %d", n, len(want))
	}
	for idx := 0; idx < len(want); idx++ {
		got, ok := reader.Next()
		switch {
		case !ok:
			t.Fatalf("unexpected EOF at index %d", idx)
		case got != want[idx]:
			t.Fatalf("kmer %d: got %d, want %d", idx, got, want[idx])
		}
	}

	// One more read must report that the stream is exhausted.
	if _, ok := reader.Next(); ok {
		t.Fatal("expected EOF after all k-mers")
	}
}
// TestKdiEmpty creates a KDI file without writing any k-mer and checks that
// the reader reports a count of zero and yields no value.
func TestKdiEmpty(t *testing.T) {
	kdiPath := filepath.Join(t.TempDir(), "empty.kdi")

	// Open and immediately close the writer: nothing is written in between.
	writer, err := NewKdiWriter(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	if err = writer.Close(); err != nil {
		t.Fatal(err)
	}

	reader, err := NewKdiReader(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	defer reader.Close()

	if n := reader.Count(); n != 0 {
		t.Fatalf("expected count 0, got %d", n)
	}
	if _, ok := reader.Next(); ok {
		t.Fatal("expected no k-mers in empty file")
	}
}
// TestKdiSingleValue writes exactly one k-mer (42) to a KDI file and checks
// that the reader reports a count of one, yields that value, and then reports
// end-of-stream.
func TestKdiSingleValue(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "single.kdi")

	w, err := NewKdiWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	if err := w.Write(42); err != nil {
		t.Fatal(err)
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	r, err := NewKdiReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	if r.Count() != 1 {
		t.Fatalf("expected count 1, got %d", r.Count())
	}
	v, ok := r.Next()
	if !ok {
		t.Fatal("expected one k-mer")
	}
	if v != 42 {
		t.Fatalf("got %d, want 42", v)
	}

	// A reader that kept yielding values after the only k-mer would otherwise
	// go unnoticed, so verify the stream is exhausted.
	if _, ok := r.Next(); ok {
		t.Fatal("expected EOF after the single k-mer")
	}
}
// TestKdiFileSize writes a single k-mer (0) and checks the size of the
// resulting file on disk.
func TestKdiFileSize(t *testing.T) {
	// Layout this test expects for a one-element file:
	// magic(4) + count(8) + first(8) = 20 bytes.
	const wantSize int64 = 20

	kdiPath := filepath.Join(t.TempDir(), "size.kdi")

	writer, err := NewKdiWriter(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	if err = writer.Write(0); err != nil {
		t.Fatal(err)
	}
	if err = writer.Close(); err != nil {
		t.Fatal(err)
	}

	// Stat the file only after Close has returned.
	stat, err := os.Stat(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	if got := stat.Size(); got != wantSize {
		t.Fatalf("file size: got %d, want 20", got)
	}
}
// TestKdiDeltaCompression writes 10000 evenly spaced k-mers (0, 2, 4, ...) and
// checks two things: that the file has the size this test expects when every
// delta fits in a single byte, and that the values survive a round-trip
// through the reader.
func TestKdiDeltaCompression(t *testing.T) {
	dir := t.TempDir()
	path := filepath.Join(dir, "delta.kdi")

	// Dense, evenly spaced values (step 2) should compress well.
	n := 10000
	kmers := make([]uint64, n)
	for i := range kmers {
		kmers[i] = uint64(i) * 2 // even numbers
	}

	w, err := NewKdiWriter(path)
	if err != nil {
		t.Fatal(err)
	}
	for _, v := range kmers {
		if err := w.Write(v); err != nil {
			t.Fatal(err)
		}
	}
	if err := w.Close(); err != nil {
		t.Fatal(err)
	}

	// Each delta is 2, which this test expects to be encoded as a 1-byte varint.
	// Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes.
	info, err := os.Stat(path)
	if err != nil {
		t.Fatal(err)
	}
	// Named wantSize so it is not shadowed by the per-element expected value
	// in the read-back loop below.
	wantSize := int64(20 + n - 1)
	if info.Size() != wantSize {
		t.Fatalf("file size: got %d, want %d", info.Size(), wantSize)
	}

	// Verify round-trip: count, every value in order, then end-of-stream.
	r, err := NewKdiReader(path)
	if err != nil {
		t.Fatal(err)
	}
	defer r.Close()

	if r.Count() != uint64(len(kmers)) {
		t.Fatalf("reader count: got %d, want %d", r.Count(), len(kmers))
	}
	for i, want := range kmers {
		got, ok := r.Next()
		if !ok {
			t.Fatalf("unexpected EOF at index %d", i)
		}
		if got != want {
			t.Fatalf("kmer %d: got %d, want %d", i, got, want)
		}
	}
	if _, ok := r.Next(); ok {
		t.Fatal("expected EOF after all k-mers")
	}
}
// TestKdiFromRealKmers collects the values produced by IterCanonicalKmers for
// a short DNA sequence with k=15, sorts and de-duplicates them, writes them to
// a KDI file, and checks that the reader returns the same count and the same
// values in order.
func TestKdiFromRealKmers(t *testing.T) {
	kdiPath := filepath.Join(t.TempDir(), "real.kdi")

	sequence := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
	const kmerSize = 15

	// Gather every k-mer the iterator yields, then put them in increasing order.
	var all []uint64
	for code := range IterCanonicalKmers(sequence, kmerSize) {
		all = append(all, code)
	}
	sort.Slice(all, func(a, b int) bool { return all[a] < all[b] })

	// In-place de-duplication of the sorted slice: unique reuses all's backing
	// array and keeps a value only when it differs from its predecessor.
	unique := all[:0]
	for idx, code := range all {
		if idx > 0 && code == all[idx-1] {
			continue
		}
		unique = append(unique, code)
	}

	writer, err := NewKdiWriter(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	for idx := range unique {
		if err := writer.Write(unique[idx]); err != nil {
			t.Fatal(err)
		}
	}
	if err := writer.Close(); err != nil {
		t.Fatal(err)
	}

	// Read the file back and compare against the de-duplicated list.
	reader, err := NewKdiReader(kdiPath)
	if err != nil {
		t.Fatal(err)
	}
	defer reader.Close()

	if n := reader.Count(); n != uint64(len(unique)) {
		t.Fatalf("count: got %d, want %d", n, len(unique))
	}
	for idx := 0; idx < len(unique); idx++ {
		got, ok := reader.Next()
		switch {
		case !ok:
			t.Fatalf("unexpected EOF at index %d", idx)
		case got != unique[idx]:
			t.Fatalf("kmer %d: got %d, want %d", idx, got, unique[idx])
		}
	}
}