mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations. - Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder - Add support for frequency filtering via WithMinFrequency option - Remove deprecated k-mer set persistence methods - Update CLI to use new builder approach - Add new disk-based k-mer operations (union, intersect, difference, quorum) - Introduce KDI (K-mer Delta Index) file format for efficient storage - Add K-way merge operations for combining sorted k-mer streams - Update documentation and examples to reflect new API This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
160 lines
3.3 KiB
Go
160 lines
3.3 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
// writeKdi is a helper that writes sorted kmers to a .kdi file.
|
|
func writeKdi(t *testing.T, dir, name string, kmers []uint64) string {
|
|
t.Helper()
|
|
path := filepath.Join(dir, name)
|
|
w, err := NewKdiWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, v := range kmers {
|
|
if err := w.Write(v); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
return path
|
|
}
|
|
|
|
func TestKWayMergeBasic(t *testing.T) {
|
|
dir := t.TempDir()
|
|
|
|
// Three sorted streams
|
|
p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 3, 5, 7})
|
|
p2 := writeKdi(t, dir, "b.kdi", []uint64{2, 3, 6, 7})
|
|
p3 := writeKdi(t, dir, "c.kdi", []uint64{3, 4, 7, 8})
|
|
|
|
r1, _ := NewKdiReader(p1)
|
|
r2, _ := NewKdiReader(p2)
|
|
r3, _ := NewKdiReader(p3)
|
|
|
|
m := NewKWayMerge([]*KdiReader{r1, r2, r3})
|
|
defer m.Close()
|
|
|
|
type result struct {
|
|
kmer uint64
|
|
count int
|
|
}
|
|
var results []result
|
|
for {
|
|
kmer, count, ok := m.Next()
|
|
if !ok {
|
|
break
|
|
}
|
|
results = append(results, result{kmer, count})
|
|
}
|
|
|
|
expected := []result{
|
|
{1, 1}, {2, 1}, {3, 3}, {4, 1}, {5, 1}, {6, 1}, {7, 3}, {8, 1},
|
|
}
|
|
if len(results) != len(expected) {
|
|
t.Fatalf("got %d results, want %d", len(results), len(expected))
|
|
}
|
|
for i, exp := range expected {
|
|
if results[i] != exp {
|
|
t.Errorf("result %d: got %+v, want %+v", i, results[i], exp)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestKWayMergeSingleStream(t *testing.T) {
|
|
dir := t.TempDir()
|
|
p := writeKdi(t, dir, "a.kdi", []uint64{10, 20, 30})
|
|
|
|
r, _ := NewKdiReader(p)
|
|
m := NewKWayMerge([]*KdiReader{r})
|
|
defer m.Close()
|
|
|
|
vals := []uint64{10, 20, 30}
|
|
for _, expected := range vals {
|
|
kmer, count, ok := m.Next()
|
|
if !ok {
|
|
t.Fatal("unexpected EOF")
|
|
}
|
|
if kmer != expected || count != 1 {
|
|
t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, expected)
|
|
}
|
|
}
|
|
_, _, ok := m.Next()
|
|
if ok {
|
|
t.Fatal("expected EOF")
|
|
}
|
|
}
|
|
|
|
func TestKWayMergeEmpty(t *testing.T) {
|
|
dir := t.TempDir()
|
|
|
|
p1 := writeKdi(t, dir, "a.kdi", nil)
|
|
p2 := writeKdi(t, dir, "b.kdi", nil)
|
|
|
|
r1, _ := NewKdiReader(p1)
|
|
r2, _ := NewKdiReader(p2)
|
|
|
|
m := NewKWayMerge([]*KdiReader{r1, r2})
|
|
defer m.Close()
|
|
|
|
_, _, ok := m.Next()
|
|
if ok {
|
|
t.Fatal("expected no results from empty streams")
|
|
}
|
|
}
|
|
|
|
func TestKWayMergeDisjoint(t *testing.T) {
|
|
dir := t.TempDir()
|
|
|
|
p1 := writeKdi(t, dir, "a.kdi", []uint64{1, 2, 3})
|
|
p2 := writeKdi(t, dir, "b.kdi", []uint64{10, 20, 30})
|
|
|
|
r1, _ := NewKdiReader(p1)
|
|
r2, _ := NewKdiReader(p2)
|
|
|
|
m := NewKWayMerge([]*KdiReader{r1, r2})
|
|
defer m.Close()
|
|
|
|
expected := []uint64{1, 2, 3, 10, 20, 30}
|
|
for _, exp := range expected {
|
|
kmer, count, ok := m.Next()
|
|
if !ok {
|
|
t.Fatal("unexpected EOF")
|
|
}
|
|
if kmer != exp || count != 1 {
|
|
t.Fatalf("got (%d, %d), want (%d, 1)", kmer, count, exp)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestKWayMergeAllSame(t *testing.T) {
|
|
dir := t.TempDir()
|
|
|
|
p1 := writeKdi(t, dir, "a.kdi", []uint64{42})
|
|
p2 := writeKdi(t, dir, "b.kdi", []uint64{42})
|
|
p3 := writeKdi(t, dir, "c.kdi", []uint64{42})
|
|
|
|
r1, _ := NewKdiReader(p1)
|
|
r2, _ := NewKdiReader(p2)
|
|
r3, _ := NewKdiReader(p3)
|
|
|
|
m := NewKWayMerge([]*KdiReader{r1, r2, r3})
|
|
defer m.Close()
|
|
|
|
kmer, count, ok := m.Next()
|
|
if !ok {
|
|
t.Fatal("expected one result")
|
|
}
|
|
if kmer != 42 || count != 3 {
|
|
t.Fatalf("got (%d, %d), want (42, 3)", kmer, count)
|
|
}
|
|
_, _, ok = m.Next()
|
|
if ok {
|
|
t.Fatal("expected EOF")
|
|
}
|
|
}
|