mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Refactor SuperKmer extraction to use iterator pattern
This commit refactors the SuperKmer extraction functionality to use Go's new iterator pattern. The ExtractSuperKmers function is now implemented as a wrapper around a new IterSuperKmers iterator function, which yields results one at a time instead of building a complete slice. This change provides better memory efficiency and more flexible consumption of super k-mers. The functionality remains the same, but the interface is now more idiomatic and efficient for large datasets.
This commit is contained in:
@@ -447,141 +447,6 @@ func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
|
||||||
// share the same minimizer. A minimizer is the smallest canonical m-mer
|
|
||||||
// among the (k-m+1) m-mers contained in a k-mer.
|
|
||||||
type SuperKmer struct {
|
|
||||||
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
|
||||||
Start int // Starting position in the original sequence (0-indexed)
|
|
||||||
End int // Ending position (exclusive, like Go slice notation)
|
|
||||||
Sequence []byte // The actual DNA subsequence [Start:End]
|
|
||||||
}
|
|
||||||
|
|
||||||
// dequeItem represents an element in the monotone deque used for
|
|
||||||
// tracking minimizers in a sliding window.
|
|
||||||
type dequeItem struct {
|
|
||||||
position int // Position of the m-mer in the sequence
|
|
||||||
canonical uint64 // Canonical (normalized) m-mer value
|
|
||||||
}
|
|
||||||
|
|
||||||
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
|
||||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
|
||||||
// share the same minimizer. The minimizer of a k-mer is the smallest
|
|
||||||
// canonical m-mer among its (k-m+1) constituent m-mers.
|
|
||||||
//
|
|
||||||
// The algorithm uses:
|
|
||||||
// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation
|
|
||||||
// - Monotone deque for O(1) amortized minimizer tracking per position
|
|
||||||
//
|
|
||||||
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
|
||||||
// available for error markers if needed.
|
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
||||||
// - k: k-mer size (must be between m+1 and 31)
|
|
||||||
// - m: minimizer size (must be between 1 and k-1)
|
|
||||||
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// - slice of SuperKmer structs representing maximal subsequences
|
|
||||||
// - nil if parameters are invalid or sequence is too short
|
|
||||||
//
|
|
||||||
// Time complexity: O(n) where n is the sequence length
|
|
||||||
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
|
||||||
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
|
||||||
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
var result []SuperKmer
|
|
||||||
if buffer == nil {
|
|
||||||
estimatedSize := len(seq) / k
|
|
||||||
if estimatedSize < 1 {
|
|
||||||
estimatedSize = 1
|
|
||||||
}
|
|
||||||
result = make([]SuperKmer, 0, estimatedSize)
|
|
||||||
} else {
|
|
||||||
result = (*buffer)[:0]
|
|
||||||
}
|
|
||||||
|
|
||||||
deque := make([]dequeItem, 0, k-m+1)
|
|
||||||
|
|
||||||
mMask := uint64(1)<<(m*2) - 1
|
|
||||||
rcShift := uint((m - 1) * 2)
|
|
||||||
|
|
||||||
var fwdMmer, rvcMmer uint64
|
|
||||||
for i := 0; i < m-1 && i < len(seq); i++ {
|
|
||||||
code := uint64(__single_base_code__[seq[i]&31])
|
|
||||||
fwdMmer = (fwdMmer << 2) | code
|
|
||||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
|
||||||
}
|
|
||||||
|
|
||||||
superKmerStart := 0
|
|
||||||
var currentMinimizer uint64
|
|
||||||
firstKmer := true
|
|
||||||
|
|
||||||
for pos := m - 1; pos < len(seq); pos++ {
|
|
||||||
code := uint64(__single_base_code__[seq[pos]&31])
|
|
||||||
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
|
||||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
|
||||||
|
|
||||||
canonical := fwdMmer
|
|
||||||
if rvcMmer < fwdMmer {
|
|
||||||
canonical = rvcMmer
|
|
||||||
}
|
|
||||||
|
|
||||||
mmerPos := pos - m + 1
|
|
||||||
|
|
||||||
if pos >= k-1 {
|
|
||||||
windowStart := pos - k + 1
|
|
||||||
for len(deque) > 0 && deque[0].position < windowStart {
|
|
||||||
deque = deque[1:]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
|
||||||
deque = deque[:len(deque)-1]
|
|
||||||
}
|
|
||||||
|
|
||||||
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
|
||||||
|
|
||||||
if pos >= k-1 {
|
|
||||||
newMinimizer := deque[0].canonical
|
|
||||||
kmerStart := pos - k + 1
|
|
||||||
|
|
||||||
if firstKmer {
|
|
||||||
currentMinimizer = newMinimizer
|
|
||||||
firstKmer = false
|
|
||||||
} else if newMinimizer != currentMinimizer {
|
|
||||||
endPos := kmerStart + k - 1
|
|
||||||
superKmer := SuperKmer{
|
|
||||||
Minimizer: currentMinimizer,
|
|
||||||
Start: superKmerStart,
|
|
||||||
End: endPos,
|
|
||||||
Sequence: seq[superKmerStart:endPos],
|
|
||||||
}
|
|
||||||
result = append(result, superKmer)
|
|
||||||
|
|
||||||
superKmerStart = kmerStart
|
|
||||||
currentMinimizer = newMinimizer
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !firstKmer {
|
|
||||||
superKmer := SuperKmer{
|
|
||||||
Minimizer: currentMinimizer,
|
|
||||||
Start: superKmerStart,
|
|
||||||
End: len(seq),
|
|
||||||
Sequence: seq[superKmerStart:],
|
|
||||||
}
|
|
||||||
result = append(result, superKmer)
|
|
||||||
}
|
|
||||||
|
|
||||||
return result
|
|
||||||
}
|
|
||||||
|
|
||||||
// ReverseComplement computes the reverse complement of an encoded k-mer.
|
// ReverseComplement computes the reverse complement of an encoded k-mer.
|
||||||
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
|
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
|
||||||
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
|
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
|
||||||
|
|||||||
59
pkg/obikmer/superkmer.go
Normal file
59
pkg/obikmer/superkmer.go
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
||||||
|
// share the same minimizer.
|
||||||
|
type SuperKmer struct {
|
||||||
|
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
||||||
|
Start int // Starting position in the original sequence (0-indexed)
|
||||||
|
End int // Ending position (exclusive, like Go slice notation)
|
||||||
|
Sequence []byte // The actual DNA subsequence [Start:End]
|
||||||
|
}
|
||||||
|
|
||||||
|
// dequeItem represents an element in the monotone deque used for
|
||||||
|
// tracking minimizers in a sliding window.
|
||||||
|
type dequeItem struct {
|
||||||
|
position int // Position of the m-mer in the sequence
|
||||||
|
canonical uint64 // Canonical (normalized) m-mer value
|
||||||
|
}
|
||||||
|
|
||||||
|
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
||||||
|
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||||
|
// share the same minimizer. The minimizer of a k-mer is the smallest
|
||||||
|
// canonical m-mer among its (k-m+1) constituent m-mers.
|
||||||
|
//
|
||||||
|
// This function uses IterSuperKmers internally and collects results into a slice.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||||
|
// - k: k-mer size (must be between m+1 and 31)
|
||||||
|
// - m: minimizer size (must be between 1 and k-1)
|
||||||
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - slice of SuperKmer structs representing maximal subsequences
|
||||||
|
// - nil if parameters are invalid or sequence is too short
|
||||||
|
//
|
||||||
|
// Time complexity: O(n) where n is the sequence length
|
||||||
|
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
||||||
|
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
||||||
|
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var result []SuperKmer
|
||||||
|
if buffer == nil {
|
||||||
|
estimatedSize := len(seq) / k
|
||||||
|
if estimatedSize < 1 {
|
||||||
|
estimatedSize = 1
|
||||||
|
}
|
||||||
|
result = make([]SuperKmer, 0, estimatedSize)
|
||||||
|
} else {
|
||||||
|
result = (*buffer)[:0]
|
||||||
|
}
|
||||||
|
|
||||||
|
for sk := range IterSuperKmers(seq, k, m) {
|
||||||
|
result = append(result, sk)
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
|
}
|
||||||
215
pkg/obikmer/superkmer_iter.go
Normal file
215
pkg/obikmer/superkmer_iter.go
Normal file
@@ -0,0 +1,215 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"iter"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// IterSuperKmers returns an iterator over super k-mers extracted from a DNA sequence.
|
||||||
|
// It uses the same algorithm as ExtractSuperKmers but yields super k-mers one at a time.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||||
|
// - k: k-mer size (must be between m+1 and 31)
|
||||||
|
// - m: minimizer size (must be between 1 and k-1)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - An iterator that yields SuperKmer structs
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// for sk := range IterSuperKmers(sequence, 21, 11) {
|
||||||
|
// fmt.Printf("SuperKmer at %d-%d with minimizer %d\n", sk.Start, sk.End, sk.Minimizer)
|
||||||
|
// }
|
||||||
|
func IterSuperKmers(seq []byte, k int, m int) iter.Seq[SuperKmer] {
|
||||||
|
return func(yield func(SuperKmer) bool) {
|
||||||
|
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
deque := make([]dequeItem, 0, k-m+1)
|
||||||
|
|
||||||
|
mMask := uint64(1)<<(m*2) - 1
|
||||||
|
rcShift := uint((m - 1) * 2)
|
||||||
|
|
||||||
|
var fwdMmer, rvcMmer uint64
|
||||||
|
for i := 0; i < m-1 && i < len(seq); i++ {
|
||||||
|
code := uint64(__single_base_code__[seq[i]&31])
|
||||||
|
fwdMmer = (fwdMmer << 2) | code
|
||||||
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||||
|
}
|
||||||
|
|
||||||
|
superKmerStart := 0
|
||||||
|
var currentMinimizer uint64
|
||||||
|
firstKmer := true
|
||||||
|
|
||||||
|
for pos := m - 1; pos < len(seq); pos++ {
|
||||||
|
code := uint64(__single_base_code__[seq[pos]&31])
|
||||||
|
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||||
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||||
|
|
||||||
|
canonical := fwdMmer
|
||||||
|
if rvcMmer < fwdMmer {
|
||||||
|
canonical = rvcMmer
|
||||||
|
}
|
||||||
|
|
||||||
|
mmerPos := pos - m + 1
|
||||||
|
|
||||||
|
if pos >= k-1 {
|
||||||
|
windowStart := pos - k + 1
|
||||||
|
for len(deque) > 0 && deque[0].position < windowStart {
|
||||||
|
deque = deque[1:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
||||||
|
deque = deque[:len(deque)-1]
|
||||||
|
}
|
||||||
|
|
||||||
|
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
||||||
|
|
||||||
|
if pos >= k-1 {
|
||||||
|
newMinimizer := deque[0].canonical
|
||||||
|
kmerStart := pos - k + 1
|
||||||
|
|
||||||
|
if firstKmer {
|
||||||
|
currentMinimizer = newMinimizer
|
||||||
|
firstKmer = false
|
||||||
|
} else if newMinimizer != currentMinimizer {
|
||||||
|
endPos := kmerStart + k - 1
|
||||||
|
superKmer := SuperKmer{
|
||||||
|
Minimizer: currentMinimizer,
|
||||||
|
Start: superKmerStart,
|
||||||
|
End: endPos,
|
||||||
|
Sequence: seq[superKmerStart:endPos],
|
||||||
|
}
|
||||||
|
if !yield(superKmer) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
superKmerStart = kmerStart
|
||||||
|
currentMinimizer = newMinimizer
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !firstKmer {
|
||||||
|
superKmer := SuperKmer{
|
||||||
|
Minimizer: currentMinimizer,
|
||||||
|
Start: superKmerStart,
|
||||||
|
End: len(seq),
|
||||||
|
Sequence: seq[superKmerStart:],
|
||||||
|
}
|
||||||
|
yield(superKmer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToBioSequence converts a SuperKmer to a BioSequence with metadata.
|
||||||
|
//
|
||||||
|
// The resulting BioSequence contains:
|
||||||
|
// - ID: "{parentID}_superkmer_{start}_{end}"
|
||||||
|
// - Sequence: the actual DNA subsequence
|
||||||
|
// - Attributes:
|
||||||
|
// - "minimizer_value" (uint64): the canonical minimizer value
|
||||||
|
// - "minimizer_seq" (string): the DNA sequence of the minimizer
|
||||||
|
// - "k" (int): the k-mer size
|
||||||
|
// - "m" (int): the minimizer size
|
||||||
|
// - "start" (int): starting position in original sequence
|
||||||
|
// - "end" (int): ending position in original sequence
|
||||||
|
// - "parent_id" (string): ID of the parent sequence
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: k-mer size used for extraction
|
||||||
|
// - m: minimizer size used for extraction
|
||||||
|
// - parentID: ID of the parent sequence
|
||||||
|
// - parentSource: source field from the parent sequence
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - *obiseq.BioSequence: A new BioSequence representing this super k-mer
|
||||||
|
func (sk *SuperKmer) ToBioSequence(k int, m int, parentID string, parentSource string) *obiseq.BioSequence {
|
||||||
|
// Create ID for the super-kmer
|
||||||
|
var id string
|
||||||
|
if parentID != "" {
|
||||||
|
id = fmt.Sprintf("%s_superkmer_%d_%d", parentID, sk.Start, sk.End)
|
||||||
|
} else {
|
||||||
|
id = fmt.Sprintf("superkmer_%d_%d", sk.Start, sk.End)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the BioSequence
|
||||||
|
seq := obiseq.NewBioSequence(id, sk.Sequence, "")
|
||||||
|
|
||||||
|
// Copy source from parent
|
||||||
|
if parentSource != "" {
|
||||||
|
seq.SetSource(parentSource)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set attributes
|
||||||
|
seq.SetAttribute("minimizer_value", sk.Minimizer)
|
||||||
|
|
||||||
|
// Decode the minimizer to get its DNA sequence
|
||||||
|
minimizerSeq := DecodeKmer(sk.Minimizer, m, nil)
|
||||||
|
seq.SetAttribute("minimizer_seq", string(minimizerSeq))
|
||||||
|
|
||||||
|
seq.SetAttribute("k", k)
|
||||||
|
seq.SetAttribute("m", m)
|
||||||
|
seq.SetAttribute("start", sk.Start)
|
||||||
|
seq.SetAttribute("end", sk.End)
|
||||||
|
|
||||||
|
if parentID != "" {
|
||||||
|
seq.SetAttribute("parent_id", parentID)
|
||||||
|
}
|
||||||
|
|
||||||
|
return seq
|
||||||
|
}
|
||||||
|
|
||||||
|
// SuperKmerWorker creates a SeqWorker that extracts super k-mers from a BioSequence
|
||||||
|
// and returns them as a slice of BioSequence objects.
|
||||||
|
//
|
||||||
|
// The worker copies the source field from the parent sequence to all extracted super k-mers.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - k: k-mer size (must be between m+1 and 31)
|
||||||
|
// - m: minimizer size (must be between 1 and k-1)
|
||||||
|
//
|
||||||
|
// Returns:
|
||||||
|
// - SeqWorker: A worker function that can be used in obiiter pipelines
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// worker := SuperKmerWorker(21, 11)
|
||||||
|
// iterator := iterator.MakeIWorker(worker, false)
|
||||||
|
func SuperKmerWorker(k int, m int) obiseq.SeqWorker {
|
||||||
|
return func(seq *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||||
|
if seq == nil {
|
||||||
|
return obiseq.BioSequenceSlice{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Validate parameters
|
||||||
|
if m < 1 || m >= k || k < 2 || k > 31 {
|
||||||
|
return obiseq.BioSequenceSlice{}, fmt.Errorf(
|
||||||
|
"invalid parameters: k=%d, m=%d (need 1 <= m < k <= 31)",
|
||||||
|
k, m)
|
||||||
|
}
|
||||||
|
|
||||||
|
sequence := seq.Sequence()
|
||||||
|
if len(sequence) < k {
|
||||||
|
return obiseq.BioSequenceSlice{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
parentID := seq.Id()
|
||||||
|
parentSource := seq.Source()
|
||||||
|
|
||||||
|
// Extract super k-mers and convert to BioSequences
|
||||||
|
result := make(obiseq.BioSequenceSlice, 0)
|
||||||
|
|
||||||
|
for sk := range IterSuperKmers(sequence, k, m) {
|
||||||
|
bioSeq := sk.ToBioSequence(k, m, parentID, parentSource)
|
||||||
|
result = append(result, bioSeq)
|
||||||
|
}
|
||||||
|
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
82
pkg/obikmer/superkmer_iter_test.go
Normal file
82
pkg/obikmer/superkmer_iter_test.go
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
package obikmer
|
||||||
|
|
||||||
|
import (
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestIterSuperKmers(t *testing.T) {
|
||||||
|
seq := []byte("ACGTACGTGGGGAAAA")
|
||||||
|
k := 5
|
||||||
|
m := 3
|
||||||
|
|
||||||
|
count := 0
|
||||||
|
for sk := range IterSuperKmers(seq, k, m) {
|
||||||
|
count++
|
||||||
|
t.Logf("SuperKmer %d: Minimizer=%d, Start=%d, End=%d, Seq=%s",
|
||||||
|
count, sk.Minimizer, sk.Start, sk.End, string(sk.Sequence))
|
||||||
|
|
||||||
|
// Verify sequence boundaries
|
||||||
|
if sk.Start < 0 || sk.End > len(seq) {
|
||||||
|
t.Errorf("Invalid boundaries: Start=%d, End=%d, seqLen=%d",
|
||||||
|
sk.Start, sk.End, len(seq))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify sequence content
|
||||||
|
if string(sk.Sequence) != string(seq[sk.Start:sk.End]) {
|
||||||
|
t.Errorf("Sequence mismatch: expected %s, got %s",
|
||||||
|
string(seq[sk.Start:sk.End]), string(sk.Sequence))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if count == 0 {
|
||||||
|
t.Error("No super k-mers extracted")
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Logf("Total super k-mers extracted: %d", count)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIterSuperKmersVsSlice(t *testing.T) {
|
||||||
|
seq := []byte("ACGTACGTGGGGAAAAACGTACGT")
|
||||||
|
k := 7
|
||||||
|
m := 4
|
||||||
|
|
||||||
|
// Extract using slice version
|
||||||
|
sliceResult := ExtractSuperKmers(seq, k, m, nil)
|
||||||
|
|
||||||
|
// Extract using iterator version
|
||||||
|
var iterResult []SuperKmer
|
||||||
|
for sk := range IterSuperKmers(seq, k, m) {
|
||||||
|
iterResult = append(iterResult, sk)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare counts
|
||||||
|
if len(sliceResult) != len(iterResult) {
|
||||||
|
t.Errorf("Different number of super k-mers: slice=%d, iter=%d",
|
||||||
|
len(sliceResult), len(iterResult))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare each super k-mer
|
||||||
|
for i := 0; i < len(sliceResult) && i < len(iterResult); i++ {
|
||||||
|
slice := sliceResult[i]
|
||||||
|
iter := iterResult[i]
|
||||||
|
|
||||||
|
if slice.Minimizer != iter.Minimizer {
|
||||||
|
t.Errorf("SuperKmer %d: different minimizers: slice=%d, iter=%d",
|
||||||
|
i, slice.Minimizer, iter.Minimizer)
|
||||||
|
}
|
||||||
|
|
||||||
|
if slice.Start != iter.Start || slice.End != iter.End {
|
||||||
|
t.Errorf("SuperKmer %d: different boundaries: slice=[%d:%d], iter=[%d:%d]",
|
||||||
|
i, slice.Start, slice.End, iter.Start, iter.End)
|
||||||
|
}
|
||||||
|
|
||||||
|
if string(slice.Sequence) != string(iter.Sequence) {
|
||||||
|
t.Errorf("SuperKmer %d: different sequences: slice=%s, iter=%s",
|
||||||
|
i, string(slice.Sequence), string(iter.Sequence))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: Tests for ToBioSequence and SuperKmerWorker are in a separate
|
||||||
|
// integration test package to avoid circular dependencies between
|
||||||
|
// obikmer and obiseq packages.
|
||||||
Reference in New Issue
Block a user