mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Refactor k-mer index building to use the new disk-based KmerSetGroupBuilder instead of the old KmerSet and FrequencyFilter approaches. This change introduces a more efficient and scalable approach to building k-mer indices by using partitioned disk storage with streaming operations. - Replace BuildKmerIndex and BuildFrequencyFilterIndex with KmerSetGroupBuilder - Add support for frequency filtering via WithMinFrequency option - Remove deprecated k-mer set persistence methods - Update CLI to use new builder approach - Add new disk-based k-mer operations (union, intersect, difference, quorum) - Introduce KDI (K-mer Delta Index) file format for efficient storage - Add K-way merge operations for combining sorted k-mer streams - Update documentation and examples to reflect new API This refactoring provides better memory usage, faster operations on large datasets, and more flexible k-mer set operations.
177 lines
3.4 KiB
Go
177 lines
3.4 KiB
Go
package obikmer
|
|
|
|
import (
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
)
|
|
|
|
func TestSkmRoundTrip(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "test.skm")
|
|
|
|
// Create super-kmers from a known sequence
|
|
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT")
|
|
k := 21
|
|
m := 9
|
|
superKmers := ExtractSuperKmers(seq, k, m, nil)
|
|
if len(superKmers) == 0 {
|
|
t.Fatal("no super-kmers extracted")
|
|
}
|
|
|
|
// Write
|
|
w, err := NewSkmWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, sk := range superKmers {
|
|
if err := w.Write(sk); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Read back
|
|
r, err := NewSkmReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
idx := 0
|
|
for {
|
|
sk, ok := r.Next()
|
|
if !ok {
|
|
break
|
|
}
|
|
if idx >= len(superKmers) {
|
|
t.Fatal("read more super-kmers than written")
|
|
}
|
|
expected := superKmers[idx]
|
|
if len(sk.Sequence) != len(expected.Sequence) {
|
|
t.Fatalf("super-kmer %d: length mismatch: got %d, want %d",
|
|
idx, len(sk.Sequence), len(expected.Sequence))
|
|
}
|
|
// Compare nucleotide-by-nucleotide (case insensitive since decode produces lowercase)
|
|
for j := range sk.Sequence {
|
|
got := sk.Sequence[j] | 0x20
|
|
want := expected.Sequence[j] | 0x20
|
|
if got != want {
|
|
t.Fatalf("super-kmer %d pos %d: got %c, want %c", idx, j, got, want)
|
|
}
|
|
}
|
|
idx++
|
|
}
|
|
if idx != len(superKmers) {
|
|
t.Fatalf("read %d super-kmers, want %d", idx, len(superKmers))
|
|
}
|
|
}
|
|
|
|
func TestSkmEmptyFile(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "empty.skm")
|
|
|
|
// Write nothing
|
|
w, err := NewSkmWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Read back
|
|
r, err := NewSkmReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
_, ok := r.Next()
|
|
if ok {
|
|
t.Fatal("expected no super-kmers in empty file")
|
|
}
|
|
}
|
|
|
|
func TestSkmSingleBase(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "single.skm")
|
|
|
|
// Test with sequences of various lengths to check padding
|
|
sequences := [][]byte{
|
|
[]byte("A"),
|
|
[]byte("AC"),
|
|
[]byte("ACG"),
|
|
[]byte("ACGT"),
|
|
[]byte("ACGTA"),
|
|
}
|
|
|
|
w, err := NewSkmWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
for _, seq := range sequences {
|
|
sk := SuperKmer{Sequence: seq}
|
|
if err := w.Write(sk); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
r, err := NewSkmReader(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
defer r.Close()
|
|
|
|
for i, expected := range sequences {
|
|
sk, ok := r.Next()
|
|
if !ok {
|
|
t.Fatalf("expected super-kmer %d, got EOF", i)
|
|
}
|
|
if len(sk.Sequence) != len(expected) {
|
|
t.Fatalf("sk %d: length %d, want %d", i, len(sk.Sequence), len(expected))
|
|
}
|
|
for j := range sk.Sequence {
|
|
got := sk.Sequence[j] | 0x20
|
|
want := expected[j] | 0x20
|
|
if got != want {
|
|
t.Fatalf("sk %d pos %d: got %c, want %c", i, j, got, want)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestSkmFileSize(t *testing.T) {
|
|
dir := t.TempDir()
|
|
path := filepath.Join(dir, "size.skm")
|
|
|
|
// Write a sequence of known length
|
|
seq := []byte("ACGTACGTAC") // 10 bases
|
|
sk := SuperKmer{Sequence: seq}
|
|
|
|
w, err := NewSkmWriter(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Write(sk); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// Expected: 2 bytes (length) + ceil(10/4)=3 bytes (data) = 5 bytes
|
|
info, err := os.Stat(path)
|
|
if err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
if info.Size() != 5 {
|
|
t.Fatalf("file size: got %d, want 5", info.Size())
|
|
}
|
|
}
|