mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor k-mer index building to use disk-based KmerSetGroupBuilder
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations.

- Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder
- Add support for frequency filtering via WithMinFrequency option
- Remove deprecated k-mer set persistence methods
- Update CLI to use new builder approach
- Add new disk-based k-mer operations (union, intersect, difference, quorum)
- Introduce KDI (K-mer Delta Index) file format for efficient storage
- Add K-way merge operations for combining sorted k-mer streams
- Update documentation and examples to reflect new API

This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
This commit is contained in:
176
pkg/obikmer/skm_test.go
Normal file
176
pkg/obikmer/skm_test.go
Normal file
@@ -0,0 +1,176 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSkmRoundTrip(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "test.skm")
|
||||
|
||||
// Create super-kmers from a known sequence
|
||||
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
|
||||
k := 21
|
||||
m := 9
|
||||
superKmers := ExtractSuperKmers(seq, k, m, nil)
|
||||
if len(superKmers) == 0 {
|
||||
t.Fatal("no super-kmers extracted")
|
||||
}
|
||||
|
||||
// Write
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, sk := range superKmers {
|
||||
if err := w.Write(sk); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Read back
|
||||
r, err := NewSkmReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
idx := 0
|
||||
for {
|
||||
sk, ok := r.Next()
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
if idx >= len(superKmers) {
|
||||
t.Fatal("read more super-kmers than written")
|
||||
}
|
||||
expected := superKmers[idx]
|
||||
if len(sk.Sequence) != len(expected.Sequence) {
|
||||
t.Fatalf("super-kmer %d: length mismatch: got %d, want %d",
|
||||
idx, len(sk.Sequence), len(expected.Sequence))
|
||||
}
|
||||
// Compare nucleotide-by-nucleotide (case insensitive since decode produces lowercase)
|
||||
for j := range sk.Sequence {
|
||||
got := sk.Sequence[j] | 0x20
|
||||
want := expected.Sequence[j] | 0x20
|
||||
if got != want {
|
||||
t.Fatalf("super-kmer %d pos %d: got %c, want %c", idx, j, got, want)
|
||||
}
|
||||
}
|
||||
idx++
|
||||
}
|
||||
if idx != len(superKmers) {
|
||||
t.Fatalf("read %d super-kmers, want %d", idx, len(superKmers))
|
||||
}
|
||||
}
|
||||
|
||||
func TestSkmEmptyFile(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "empty.skm")
|
||||
|
||||
// Write nothing
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Read back
|
||||
r, err := NewSkmReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
_, ok := r.Next()
|
||||
if ok {
|
||||
t.Fatal("expected no super-kmers in empty file")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSkmSingleBase(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "single.skm")
|
||||
|
||||
// Test with sequences of various lengths to check padding
|
||||
sequences := [][]byte{
|
||||
[]byte("A"),
|
||||
[]byte("AC"),
|
||||
[]byte("ACG"),
|
||||
[]byte("ACGT"),
|
||||
[]byte("ACGTA"),
|
||||
}
|
||||
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, seq := range sequences {
|
||||
sk := SuperKmer{Sequence: seq}
|
||||
if err := w.Write(sk); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
r, err := NewSkmReader(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer r.Close()
|
||||
|
||||
for i, expected := range sequences {
|
||||
sk, ok := r.Next()
|
||||
if !ok {
|
||||
t.Fatalf("expected super-kmer %d, got EOF", i)
|
||||
}
|
||||
if len(sk.Sequence) != len(expected) {
|
||||
t.Fatalf("sk %d: length %d, want %d", i, len(sk.Sequence), len(expected))
|
||||
}
|
||||
for j := range sk.Sequence {
|
||||
got := sk.Sequence[j] | 0x20
|
||||
want := expected[j] | 0x20
|
||||
if got != want {
|
||||
t.Fatalf("sk %d pos %d: got %c, want %c", i, j, got, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSkmFileSize(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "size.skm")
|
||||
|
||||
// Write a sequence of known length
|
||||
seq := []byte("ACGTACGTAC") // 10 bases
|
||||
sk := SuperKmer{Sequence: seq}
|
||||
|
||||
w, err := NewSkmWriter(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Write(sk); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Expected: 2 bytes (length) + ceil(10/4)=3 bytes (data) = 5 bytes
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if info.Size() != 5 {
|
||||
t.Fatalf("file size: got %d, want 5", info.Size())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user