Refactor SuperKmer extraction to use iterator pattern

This commit refactors SuperKmer extraction to use Go's range-over-function iterators. The ExtractSuperKmers function is now implemented as a wrapper around a new IterSuperKmers iterator function, which yields super k-mers one at a time instead of building a complete slice. This change provides better memory efficiency and lets callers consume super k-mers more flexibly. The results produced by ExtractSuperKmers remain the same; the new iterator interface is more idiomatic and more efficient for large datasets.
This commit is contained in:
Eric Coissac
2026-02-07 12:22:59 +01:00
parent f1e2846d2d
commit 4ae331db36
4 changed files with 356 additions and 135 deletions

View File

@@ -0,0 +1,82 @@
package obikmer
import (
"testing"
)
// TestIterSuperKmers checks that every super k-mer yielded by IterSuperKmers
// has boundaries lying inside the input sequence, that its Sequence field
// matches the corresponding slice of the input, and that at least one
// super k-mer is produced.
func TestIterSuperKmers(t *testing.T) {
	seq := []byte("ACGTACGTGGGGAAAA")
	k := 5
	m := 3

	count := 0
	for sk := range IterSuperKmers(seq, k, m) {
		count++
		t.Logf("SuperKmer %d: Minimizer=%d, Start=%d, End=%d, Seq=%s",
			count, sk.Minimizer, sk.Start, sk.End, string(sk.Sequence))

		// Verify sequence boundaries. Start > End is rejected as well, and the
		// content check below is skipped on failure: slicing seq with invalid
		// bounds would panic and abort the test instead of reporting the error.
		if sk.Start < 0 || sk.Start > sk.End || sk.End > len(seq) {
			t.Errorf("Invalid boundaries: Start=%d, End=%d, seqLen=%d",
				sk.Start, sk.End, len(seq))
			continue
		}

		// Verify sequence content
		if string(sk.Sequence) != string(seq[sk.Start:sk.End]) {
			t.Errorf("Sequence mismatch: expected %s, got %s",
				string(seq[sk.Start:sk.End]), string(sk.Sequence))
		}
	}

	if count == 0 {
		t.Error("No super k-mers extracted")
	}
	t.Logf("Total super k-mers extracted: %d", count)
}
// TestIterSuperKmersVsSlice verifies that collecting the values yielded by
// IterSuperKmers gives the same super k-mers, in the same order, as the
// slice returned by ExtractSuperKmers for identical inputs.
func TestIterSuperKmersVsSlice(t *testing.T) {
	seq := []byte("ACGTACGTGGGGAAAAACGTACGT")
	k, m := 7, 4

	// Reference result from the slice-returning API.
	sliceResult := ExtractSuperKmers(seq, k, m, nil)

	// Drain the iterator into a slice so both results can be indexed.
	var iterResult []SuperKmer
	for sk := range IterSuperKmers(seq, k, m) {
		iterResult = append(iterResult, sk)
	}

	// The two APIs must produce the same number of super k-mers.
	if nSlice, nIter := len(sliceResult), len(iterResult); nSlice != nIter {
		t.Errorf("Different number of super k-mers: slice=%d, iter=%d",
			nSlice, nIter)
	}

	// Compare pairwise over the common prefix, so a count mismatch still
	// yields per-element diagnostics without indexing out of range.
	for i, want := range sliceResult {
		if i >= len(iterResult) {
			break
		}
		got := iterResult[i]

		if want.Minimizer != got.Minimizer {
			t.Errorf("SuperKmer %d: different minimizers: slice=%d, iter=%d",
				i, want.Minimizer, got.Minimizer)
		}

		if sameBounds := want.Start == got.Start && want.End == got.End; !sameBounds {
			t.Errorf("SuperKmer %d: different boundaries: slice=[%d:%d], iter=[%d:%d]",
				i, want.Start, want.End, got.Start, got.End)
		}

		if string(want.Sequence) != string(got.Sequence) {
			t.Errorf("SuperKmer %d: different sequences: slice=%s, iter=%s",
				i, string(want.Sequence), string(got.Sequence))
		}
	}
}
// Note: Tests for ToBioSequence and SuperKmerWorker are in a separate
// integration test package to avoid circular dependencies between
// obikmer and obiseq packages.