Files
obitools4/pkg/obikmer/kmer_set_disk_ops_test.go
Eric Coissac f78543ee75 Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update CLI to use new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce KDI (K-mer Delta Index) file format for efficient storage
- Add K-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect new API

This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
2026-02-10 06:49:31 +01:00

252 lines
6.2 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package obikmer
import (
"path/filepath"
"testing"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
// buildGroupFromSeqs creates a KmerSetGroup with one set per sequence.
func buildGroupFromSeqs(t *testing.T, dir string, k, m int, seqs []string) *KmerSetGroup {
t.Helper()
n := len(seqs)
builder, err := NewKmerSetGroupBuilder(dir, k, m, n, 64)
if err != nil {
t.Fatal(err)
}
for i, s := range seqs {
seq := obiseq.NewBioSequence("", []byte(s), "")
builder.AddSequence(i, seq)
}
ksg, err := builder.Close()
if err != nil {
t.Fatal(err)
}
return ksg
}
func collectKmers(t *testing.T, ksg *KmerSetGroup, setIdx int) []uint64 {
t.Helper()
var result []uint64
for kmer := range ksg.Iterator(setIdx) {
result = append(result, kmer)
}
return result
}
func TestDiskOpsUnion(t *testing.T) {
dir := t.TempDir()
indexDir := filepath.Join(dir, "index")
outDir := filepath.Join(dir, "union")
// Two sequences with some overlap
seqs := []string{
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
}
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
result, err := ksg.Union(outDir)
if err != nil {
t.Fatal(err)
}
// Union should have at least as many k-mers as each individual set
unionLen := result.Len(0)
if unionLen == 0 {
t.Fatal("union is empty")
}
if unionLen < ksg.Len(0) || unionLen < ksg.Len(1) {
t.Fatalf("union (%d) smaller than an input set (%d, %d)", unionLen, ksg.Len(0), ksg.Len(1))
}
// Union should not exceed the sum of both sets
if unionLen > ksg.Len(0)+ksg.Len(1) {
t.Fatalf("union (%d) larger than sum of sets (%d)", unionLen, ksg.Len(0)+ksg.Len(1))
}
}
func TestDiskOpsIntersect(t *testing.T) {
dir := t.TempDir()
indexDir := filepath.Join(dir, "index")
outDir := filepath.Join(dir, "intersect")
// Two sequences with some shared k-mers
seqs := []string{
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
}
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
result, err := ksg.Intersect(outDir)
if err != nil {
t.Fatal(err)
}
interLen := result.Len(0)
// Intersection should not be bigger than any individual set
if interLen > ksg.Len(0) || interLen > ksg.Len(1) {
t.Fatalf("intersection (%d) larger than input sets (%d, %d)", interLen, ksg.Len(0), ksg.Len(1))
}
}
func TestDiskOpsDifference(t *testing.T) {
dir := t.TempDir()
indexDir := filepath.Join(dir, "index")
outDir := filepath.Join(dir, "diff")
seqs := []string{
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
}
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
result, err := ksg.Difference(outDir)
if err != nil {
t.Fatal(err)
}
diffLen := result.Len(0)
// Difference = set_0 - set_1, so should be <= set_0
if diffLen > ksg.Len(0) {
t.Fatalf("difference (%d) larger than set_0 (%d)", diffLen, ksg.Len(0))
}
}
func TestDiskOpsConsistency(t *testing.T) {
dir := t.TempDir()
indexDir := filepath.Join(dir, "index")
seqs := []string{
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
}
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
unionResult, err := ksg.Union(filepath.Join(dir, "union"))
if err != nil {
t.Fatal(err)
}
interResult, err := ksg.Intersect(filepath.Join(dir, "intersect"))
if err != nil {
t.Fatal(err)
}
diffResult, err := ksg.Difference(filepath.Join(dir, "diff"))
if err != nil {
t.Fatal(err)
}
unionLen := unionResult.Len(0)
interLen := interResult.Len(0)
diffLen := diffResult.Len(0)
// |A B| = |A| + |B| - |A ∩ B|
expectedUnion := ksg.Len(0) + ksg.Len(1) - interLen
if unionLen != expectedUnion {
t.Fatalf("|AB|=%d, expected |A|+|B|-|A∩B|=%d+%d-%d=%d",
unionLen, ksg.Len(0), ksg.Len(1), interLen, expectedUnion)
}
// |A \ B| = |A| - |A ∩ B|
expectedDiff := ksg.Len(0) - interLen
if diffLen != expectedDiff {
t.Fatalf("|A\\B|=%d, expected |A|-|A∩B|=%d-%d=%d",
diffLen, ksg.Len(0), interLen, expectedDiff)
}
}
func TestDiskOpsQuorum(t *testing.T) {
dir := t.TempDir()
indexDir := filepath.Join(dir, "index")
// Three sets
seqs := []string{
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
"CTAGCTAGCTGATCGATCGATCGTTTAAACCC",
"GATCGATCGATCGAAATTTCCCGGG",
}
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
// QuorumAtLeast(1) = Union
q1, err := ksg.QuorumAtLeast(1, filepath.Join(dir, "q1"))
if err != nil {
t.Fatal(err)
}
union, err := ksg.Union(filepath.Join(dir, "union"))
if err != nil {
t.Fatal(err)
}
if q1.Len(0) != union.Len(0) {
t.Fatalf("QuorumAtLeast(1)=%d != Union=%d", q1.Len(0), union.Len(0))
}
// QuorumAtLeast(3) = Intersect
q3, err := ksg.QuorumAtLeast(3, filepath.Join(dir, "q3"))
if err != nil {
t.Fatal(err)
}
inter, err := ksg.Intersect(filepath.Join(dir, "inter"))
if err != nil {
t.Fatal(err)
}
if q3.Len(0) != inter.Len(0) {
t.Fatalf("QuorumAtLeast(3)=%d != Intersect=%d", q3.Len(0), inter.Len(0))
}
// QuorumAtLeast(2) should be between Intersect and Union
q2, err := ksg.QuorumAtLeast(2, filepath.Join(dir, "q2"))
if err != nil {
t.Fatal(err)
}
if q2.Len(0) < q3.Len(0) || q2.Len(0) > q1.Len(0) {
t.Fatalf("QuorumAtLeast(2)=%d not between intersect=%d and union=%d",
q2.Len(0), q3.Len(0), q1.Len(0))
}
}
func TestDiskOpsJaccard(t *testing.T) {
dir := t.TempDir()
indexDir := filepath.Join(dir, "index")
seqs := []string{
"ACGATCGATCTAGCTAGCTGATCGATCGATCG",
"ACGATCGATCTAGCTAGCTGATCGATCGATCG", // identical to first
"TTTTTTTTTTTTTTTTTTTTTTTTT", // completely different
}
ksg := buildGroupFromSeqs(t, indexDir, 15, 7, seqs)
dm := ksg.JaccardDistanceMatrix()
if dm == nil {
t.Fatal("JaccardDistanceMatrix returned nil")
}
// Identical sets should have distance 0
d01 := dm.Get(0, 1)
if d01 != 0.0 {
t.Fatalf("distance(0,1) = %f, expected 0.0 for identical sets", d01)
}
// Completely different sets should have distance 1.0
d02 := dm.Get(0, 2)
if d02 != 1.0 {
t.Fatalf("distance(0,2) = %f, expected 1.0 for disjoint sets", d02)
}
// Similarity matrix
sm := ksg.JaccardSimilarityMatrix()
if sm == nil {
t.Fatal("JaccardSimilarityMatrix returned nil")
}
s01 := sm.Get(0, 1)
if s01 != 1.0 {
t.Fatalf("similarity(0,1) = %f, expected 1.0 for identical sets", s01)
}
s02 := sm.Get(0, 2)
if s02 != 0.0 {
t.Fatalf("similarity(0,2) = %f, expected 0.0 for disjoint sets", s02)
}
}