mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update CLI to use new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce KDI (K-mer Delta Index) file format for efficient storage
- Add K-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect new API

This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
255
pkg/obikmer/kdi_test.go
Normal file
255
pkg/obikmer/kdi_test.go
Normal file
@@ -0,0 +1,255 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestKdiRoundTrip(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "test.kdi")
|
||||
|
||||
// Sorted k-mer values
|
||||
kmers := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1}
|
||||
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, v := range kmers {
|
||||
if err := w.Write(v); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if w.Count() != uint64(len(kmers)) {
|
||||
t.Fatalf("writer count: got %d, want %d", w.Count(), len(kmers))
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Read back
|
||||
r, err := NewKdiReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if r.Count() != uint64(len(kmers)) {
|
||||
t.Fatalf("reader count: got %d, want %d", r.Count(), len(kmers))
|
||||
}
|
||||
|
||||
for i, expected := range kmers {
|
||||
got, ok := r.Next()
|
||||
if !ok {
|
||||
t.Fatalf("unexpected EOF at index %d", i)
|
||||
}
|
||||
if got != expected {
|
||||
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
||||
}
|
||||
}
|
||||
|
||||
_, ok := r.Next()
|
||||
if ok {
|
||||
t.Fatal("expected EOF after all k-mers")
|
||||
}
|
||||
}
|
||||
|
||||
func TestKdiEmpty(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "empty.kdi")
|
||||
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
r, err := NewKdiReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if r.Count() != 0 {
|
||||
t.Fatalf("expected count 0, got %d", r.Count())
|
||||
}
|
||||
|
||||
_, ok := r.Next()
|
||||
if ok {
|
||||
t.Fatal("expected no k-mers in empty file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestKdiSingleValue(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "single.kdi")
|
||||
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Write(42); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
r, err := NewKdiReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if r.Count() != 1 {
|
||||
t.Fatalf("expected count 1, got %d", r.Count())
|
||||
}
|
||||
|
||||
v, ok := r.Next()
|
||||
if !ok {
|
||||
t.Fatal("expected one k-mer")
|
||||
}
|
||||
if v != 42 {
|
||||
t.Fatalf("got %d, want 42", v)
|
||||
}
|
||||
}
|
||||
|
||||
func TestKdiFileSize(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "size.kdi")
|
||||
|
||||
// Write: magic(4) + count(8) + first(8) = 20 bytes
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Write(0); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
// magic(4) + count(8) + first(8) = 20
|
||||
if info.Size() != 20 {
|
||||
t.Fatalf("file size: got %d, want 20", info.Size())
|
||||
}
|
||||
}
|
||||
|
||||
func TestKdiDeltaCompression(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "delta.kdi")
|
||||
|
||||
// Dense consecutive values should compress well
|
||||
n := 10000
|
||||
kmers := make([]uint64, n)
|
||||
for i := range kmers {
|
||||
kmers[i] = uint64(i * 2) // even numbers
|
||||
}
|
||||
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, v := range kmers {
|
||||
if err := w.Write(v); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Each delta is 2, encoded as 1 byte varint
|
||||
// Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
expected := int64(20 + n - 1)
|
||||
if info.Size() != expected {
|
||||
t.Fatalf("file size: got %d, want %d", info.Size(), expected)
|
||||
}
|
||||
|
||||
// Verify round-trip
|
||||
r, err := NewKdiReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
for i, expected := range kmers {
|
||||
got, ok := r.Next()
|
||||
if !ok {
|
||||
t.Fatalf("unexpected EOF at index %d", i)
|
||||
}
|
||||
if got != expected {
|
||||
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestKdiFromRealKmers(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "real.kdi")
|
||||
|
||||
// Extract k-mers from a sequence, sort, dedup, write to KDI
|
||||
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
|
||||
k := 15
|
||||
|
||||
var kmers []uint64
|
||||
for kmer := range IterCanonicalKmers(seq, k) {
|
||||
kmers = append(kmers, kmer)
|
||||
}
|
||||
sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })
|
||||
// Dedup
|
||||
deduped := kmers[:0]
|
||||
for i, v := range kmers {
|
||||
if i == 0 || v != kmers[i-1] {
|
||||
deduped = append(deduped, v)
|
||||
}
|
||||
}
|
||||
|
||||
w, err := NewKdiWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, v := range deduped {
|
||||
if err := w.Write(v); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Read back and verify
|
||||
r, err := NewKdiReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
if r.Count() != uint64(len(deduped)) {
|
||||
t.Fatalf("count: got %d, want %d", r.Count(), len(deduped))
|
||||
}
|
||||
|
||||
for i, expected := range deduped {
|
||||
got, ok := r.Next()
|
||||
if !ok {
|
||||
t.Fatalf("unexpected EOF at index %d", i)
|
||||
}
|
||||
if got != expected {
|
||||
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user