mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations. - Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder - Add support for frequency filtering via WithMinFrequency option - Remove deprecated k-mer set persistence methods - Update CLI to use new builder approach - Add new disk-based k-mer operations (union, intersect, difference, quorum) - Introduce KDI (K-mer Delta Index) file format for efficient storage - Add K-way merge operations for combining sorted k-mer streams - Update documentation and examples to reflect new API This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
256 lines
4.7 KiB
Go
256 lines
4.7 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"testing"
|
|
)
|
|
|
|
func TestKdiRoundTrip(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "test.kdi")
|
|
|
|
// Sorted k-mer values
|
|
kmers := []uint64{10, 20, 30, 100, 200, 500, 10000, 1 << 40, 1<<62 - 1}
|
|
|
|
w, err := NewKdiWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, v := range kmers {
|
|
if err := w.Write(v); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
}
|
|
if w.Count() != uint64(len(kmers)) {
|
|
t.Fatalf("writer count: got %d, want %d", w.Count(), len(kmers))
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Read back
|
|
r, err := NewKdiReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
if r.Count() != uint64(len(kmers)) {
|
|
t.Fatalf("reader count: got %d, want %d", r.Count(), len(kmers))
|
|
}
|
|
|
|
for i, expected := range kmers {
|
|
got, ok := r.Next()
|
|
if !ok {
|
|
t.Fatalf("unexpected EOF at index %d", i)
|
|
}
|
|
if got != expected {
|
|
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
|
}
|
|
}
|
|
|
|
_, ok := r.Next()
|
|
if ok {
|
|
t.Fatal("expected EOF after all k-mers")
|
|
}
|
|
}
|
|
|
|
func TestKdiEmpty(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "empty.kdi")
|
|
|
|
w, err := NewKdiWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
r, err := NewKdiReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
if r.Count() != 0 {
|
|
t.Fatalf("expected count 0, got %d", r.Count())
|
|
}
|
|
|
|
_, ok := r.Next()
|
|
if ok {
|
|
t.Fatal("expected no k-mers in empty file")
|
|
}
|
|
}
|
|
|
|
func TestKdiSingleValue(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "single.kdi")
|
|
|
|
w, err := NewKdiWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Write(42); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
r, err := NewKdiReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
if r.Count() != 1 {
|
|
t.Fatalf("expected count 1, got %d", r.Count())
|
|
}
|
|
|
|
v, ok := r.Next()
|
|
if !ok {
|
|
t.Fatal("expected one k-mer")
|
|
}
|
|
if v != 42 {
|
|
t.Fatalf("got %d, want 42", v)
|
|
}
|
|
}
|
|
|
|
func TestKdiFileSize(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "size.kdi")
|
|
|
|
// Write: magic(4) + count(8) + first(8) = 20 bytes
|
|
w, err := NewKdiWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Write(0); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
info, err := os.Stat(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
// magic(4) + count(8) + first(8) = 20
|
|
if info.Size() != 20 {
|
|
t.Fatalf("file size: got %d, want 20", info.Size())
|
|
}
|
|
}
|
|
|
|
func TestKdiDeltaCompression(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "delta.kdi")
|
|
|
|
// Dense consecutive values should compress well
|
|
n := 10000
|
|
kmers := make([]uint64, n)
|
|
for i := range kmers {
|
|
kmers[i] = uint64(i * 2) // even numbers
|
|
}
|
|
|
|
w, err := NewKdiWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, v := range kmers {
|
|
if err := w.Write(v); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Each delta is 2, encoded as 1 byte varint
|
|
// Total: magic(4) + count(8) + first(8) + (n-1)*1 = 20 + 9999 bytes
|
|
info, err := os.Stat(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
expected := int64(20 + n - 1)
|
|
if info.Size() != expected {
|
|
t.Fatalf("file size: got %d, want %d", info.Size(), expected)
|
|
}
|
|
|
|
// Verify round-trip
|
|
r, err := NewKdiReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
for i, expected := range kmers {
|
|
got, ok := r.Next()
|
|
if !ok {
|
|
t.Fatalf("unexpected EOF at index %d", i)
|
|
}
|
|
if got != expected {
|
|
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestKdiFromRealKmers(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "real.kdi")
|
|
|
|
// Extract k-mers from a sequence, sort, dedup, write to KDI
|
|
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
|
|
k := 15
|
|
|
|
var kmers []uint64
|
|
for kmer := range IterCanonicalKmers(seq, k) {
|
|
kmers = append(kmers, kmer)
|
|
}
|
|
sort.Slice(kmers, func(i, j int) bool { return kmers[i] < kmers[j] })
|
|
// Dedup
|
|
deduped := kmers[:0]
|
|
for i, v := range kmers {
|
|
if i == 0 || v != kmers[i-1] {
|
|
deduped = append(deduped, v)
|
|
}
|
|
}
|
|
|
|
w, err := NewKdiWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, v := range deduped {
|
|
if err := w.Write(v); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Read back and verify
|
|
r, err := NewKdiReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
if r.Count() != uint64(len(deduped)) {
|
|
t.Fatalf("count: got %d, want %d", r.Count(), len(deduped))
|
|
}
|
|
|
|
for i, expected := range deduped {
|
|
got, ok := r.Next()
|
|
if !ok {
|
|
t.Fatalf("unexpected EOF at index %d", i)
|
|
}
|
|
if got != expected {
|
|
t.Fatalf("kmer %d: got %d, want %d", i, got, expected)
|
|
}
|
|
}
|
|
}
|