From 4ae331db36f33a8d822020eea94213b82c687120 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Sat, 7 Feb 2026 12:22:59 +0100 Subject: [PATCH] Refactor SuperKmer extraction to use iterator pattern This commit refactors the SuperKmer extraction functionality to use Go's new iterator pattern. The ExtractSuperKmers function is now implemented as a wrapper around a new IterSuperKmers iterator function, which yields results one at a time instead of building a complete slice. This change provides better memory efficiency and more flexible consumption of super k-mers. The functionality remains the same, but the interface is now more idiomatic and efficient for large datasets. --- pkg/obikmer/encodekmer.go | 135 ------------------ pkg/obikmer/superkmer.go | 59 ++++++++ pkg/obikmer/superkmer_iter.go | 215 +++++++++++++++++++++++++++++ pkg/obikmer/superkmer_iter_test.go | 82 +++++++++++ 4 files changed, 356 insertions(+), 135 deletions(-) create mode 100644 pkg/obikmer/superkmer.go create mode 100644 pkg/obikmer/superkmer_iter.go create mode 100644 pkg/obikmer/superkmer_iter_test.go diff --git a/pkg/obikmer/encodekmer.go b/pkg/obikmer/encodekmer.go index fa1acf0..629c47f 100644 --- a/pkg/obikmer/encodekmer.go +++ b/pkg/obikmer/encodekmer.go @@ -447,141 +447,6 @@ func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] { } } } - -// SuperKmer represents a maximal subsequence where all consecutive k-mers -// share the same minimizer. A minimizer is the smallest canonical m-mer -// among the (k-m+1) m-mers contained in a k-mer. -type SuperKmer struct { - Minimizer uint64 // The canonical minimizer value (normalized m-mer) - Start int // Starting position in the original sequence (0-indexed) - End int // Ending position (exclusive, like Go slice notation) - Sequence []byte // The actual DNA subsequence [Start:End] -} - -// dequeItem represents an element in the monotone deque used for -// tracking minimizers in a sliding window. -type dequeItem struct { - position int // Position of the m-mer in the sequence - canonical uint64 // Canonical (normalized) m-mer value -} - -// ExtractSuperKmers extracts super k-mers from a DNA sequence. -// A super k-mer is a maximal subsequence where all consecutive k-mers -// share the same minimizer. The minimizer of a k-mer is the smallest -// canonical m-mer among its (k-m+1) constituent m-mers. -// -// The algorithm uses: -// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation -// - Monotone deque for O(1) amortized minimizer tracking per position -// -// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits -// available for error markers if needed. -// -// Parameters: -// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) -// - k: k-mer size (must be between m+1 and 31) -// - m: minimizer size (must be between 1 and k-1) -// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. -// -// Returns: -// - slice of SuperKmer structs representing maximal subsequences -// - nil if parameters are invalid or sequence is too short -// -// Time complexity: O(n) where n is the sequence length -// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results -func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer { - if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k { - return nil - } - - var result []SuperKmer - if buffer == nil { - estimatedSize := len(seq) / k - if estimatedSize < 1 { - estimatedSize = 1 - } - result = make([]SuperKmer, 0, estimatedSize) - } else { - result = (*buffer)[:0] - } - - deque := make([]dequeItem, 0, k-m+1) - - mMask := uint64(1)<<(m*2) - 1 - rcShift := uint((m - 1) * 2) - - var fwdMmer, rvcMmer uint64 - for i := 0; i < m-1 && i < len(seq); i++ { - code := uint64(__single_base_code__[seq[i]&31]) - fwdMmer = (fwdMmer << 2) | code - rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift) - } - - superKmerStart := 0 - var currentMinimizer uint64 - firstKmer := true - - for pos := m - 1; pos < len(seq); pos++ { - code := uint64(__single_base_code__[seq[pos]&31]) - fwdMmer = ((fwdMmer << 2) | code) & mMask - rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift) - - canonical := fwdMmer - if rvcMmer < fwdMmer { - canonical = rvcMmer - } - - mmerPos := pos - m + 1 - - if pos >= k-1 { - windowStart := pos - k + 1 - for len(deque) > 0 && deque[0].position < windowStart { - deque = deque[1:] - } - } - - for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical { - deque = deque[:len(deque)-1] - } - - deque = append(deque, dequeItem{position: mmerPos, canonical: canonical}) - - if pos >= k-1 { - newMinimizer := deque[0].canonical - kmerStart := pos - k + 1 - - if firstKmer { - currentMinimizer = newMinimizer - firstKmer = false - } else if newMinimizer != currentMinimizer { - endPos := kmerStart + k - 1 - superKmer := SuperKmer{ - Minimizer: currentMinimizer, - Start: superKmerStart, - End: endPos, - Sequence: seq[superKmerStart:endPos], - } - result = append(result, superKmer) - - superKmerStart = kmerStart - currentMinimizer = newMinimizer - } - } - } - - if !firstKmer { - superKmer := SuperKmer{ - Minimizer: currentMinimizer, - Start: superKmerStart, - End: len(seq), - Sequence: seq[superKmerStart:], - } - result = append(result, superKmer) - } - - return result -} - // ReverseComplement computes the reverse complement of an encoded k-mer. // The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11). // The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11. diff --git a/pkg/obikmer/superkmer.go b/pkg/obikmer/superkmer.go new file mode 100644 index 0000000..871966a --- /dev/null +++ b/pkg/obikmer/superkmer.go @@ -0,0 +1,59 @@ +package obikmer + +// SuperKmer represents a maximal subsequence where all consecutive k-mers +// share the same minimizer. +type SuperKmer struct { + Minimizer uint64 // The canonical minimizer value (normalized m-mer) + Start int // Starting position in the original sequence (0-indexed) + End int // Ending position (exclusive, like Go slice notation) + Sequence []byte // The actual DNA subsequence [Start:End] +} + +// dequeItem represents an element in the monotone deque used for +// tracking minimizers in a sliding window. +type dequeItem struct { + position int // Position of the m-mer in the sequence + canonical uint64 // Canonical (normalized) m-mer value +} + +// ExtractSuperKmers extracts super k-mers from a DNA sequence. +// A super k-mer is a maximal subsequence where all consecutive k-mers +// share the same minimizer. The minimizer of a k-mer is the smallest +// canonical m-mer among its (k-m+1) constituent m-mers. +// +// This function uses IterSuperKmers internally and collects results into a slice. +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) +// - k: k-mer size (must be between m+1 and 31) +// - m: minimizer size (must be between 1 and k-1) +// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. +// +// Returns: +// - slice of SuperKmer structs representing maximal subsequences +// - nil if parameters are invalid or sequence is too short +// +// Time complexity: O(n) where n is the sequence length +// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results +func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer { + if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k { + return nil + } + + var result []SuperKmer + if buffer == nil { + estimatedSize := len(seq) / k + if estimatedSize < 1 { + estimatedSize = 1 + } + result = make([]SuperKmer, 0, estimatedSize) + } else { + result = (*buffer)[:0] + } + + for sk := range IterSuperKmers(seq, k, m) { + result = append(result, sk) + } + + return result +} diff --git a/pkg/obikmer/superkmer_iter.go b/pkg/obikmer/superkmer_iter.go new file mode 100644 index 0000000..8f4386b --- /dev/null +++ b/pkg/obikmer/superkmer_iter.go @@ -0,0 +1,215 @@ +package obikmer + +import ( + "fmt" + "iter" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +// IterSuperKmers returns an iterator over super k-mers extracted from a DNA sequence. +// It uses the same algorithm as ExtractSuperKmers but yields super k-mers one at a time. +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) +// - k: k-mer size (must be between m+1 and 31) +// - m: minimizer size (must be between 1 and k-1) +// +// Returns: +// - An iterator that yields SuperKmer structs +// +// Example: +// +// for sk := range IterSuperKmers(sequence, 21, 11) { +// fmt.Printf("SuperKmer at %d-%d with minimizer %d\n", sk.Start, sk.End, sk.Minimizer) +// } +func IterSuperKmers(seq []byte, k int, m int) iter.Seq[SuperKmer] { + return func(yield func(SuperKmer) bool) { + if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k { + return + } + + deque := make([]dequeItem, 0, k-m+1) + + mMask := uint64(1)<<(m*2) - 1 + rcShift := uint((m - 1) * 2) + + var fwdMmer, rvcMmer uint64 + for i := 0; i < m-1 && i < len(seq); i++ { + code := uint64(__single_base_code__[seq[i]&31]) + fwdMmer = (fwdMmer << 2) | code + rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift) + } + + superKmerStart := 0 + var currentMinimizer uint64 + firstKmer := true + + for pos := m - 1; pos < len(seq); pos++ { + code := uint64(__single_base_code__[seq[pos]&31]) + fwdMmer = ((fwdMmer << 2) | code) & mMask + rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift) + + canonical := fwdMmer + if rvcMmer < fwdMmer { + canonical = rvcMmer + } + + mmerPos := pos - m + 1 + + if pos >= k-1 { + windowStart := pos - k + 1 + for len(deque) > 0 && deque[0].position < windowStart { + deque = deque[1:] + } + } + + for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical { + deque = deque[:len(deque)-1] + } + + deque = append(deque, dequeItem{position: mmerPos, canonical: canonical}) + + if pos >= k-1 { + newMinimizer := deque[0].canonical + kmerStart := pos - k + 1 + + if firstKmer { + currentMinimizer = newMinimizer + firstKmer = false + } else if newMinimizer != currentMinimizer { + endPos := kmerStart + k - 1 + superKmer := SuperKmer{ + Minimizer: currentMinimizer, + Start: superKmerStart, + End: endPos, + Sequence: seq[superKmerStart:endPos], + } + if !yield(superKmer) { + return + } + + superKmerStart = kmerStart + currentMinimizer = newMinimizer + } + } + } + + if !firstKmer { + superKmer := SuperKmer{ + Minimizer: currentMinimizer, + Start: superKmerStart, + End: len(seq), + Sequence: seq[superKmerStart:], + } + yield(superKmer) + } + } +} + +// ToBioSequence converts a SuperKmer to a BioSequence with metadata. +// +// The resulting BioSequence contains: +// - ID: "{parentID}_superkmer_{start}_{end}" +// - Sequence: the actual DNA subsequence +// - Attributes: +// - "minimizer_value" (uint64): the canonical minimizer value +// - "minimizer_seq" (string): the DNA sequence of the minimizer +// - "k" (int): the k-mer size +// - "m" (int): the minimizer size +// - "start" (int): starting position in original sequence +// - "end" (int): ending position in original sequence +// - "parent_id" (string): ID of the parent sequence +// +// Parameters: +// - k: k-mer size used for extraction +// - m: minimizer size used for extraction +// - parentID: ID of the parent sequence +// - parentSource: source field from the parent sequence +// +// Returns: +// - *obiseq.BioSequence: A new BioSequence representing this super k-mer +func (sk *SuperKmer) ToBioSequence(k int, m int, parentID string, parentSource string) *obiseq.BioSequence { + // Create ID for the super-kmer + var id string + if parentID != "" { + id = fmt.Sprintf("%s_superkmer_%d_%d", parentID, sk.Start, sk.End) + } else { + id = fmt.Sprintf("superkmer_%d_%d", sk.Start, sk.End) + } + + // Create the BioSequence + seq := obiseq.NewBioSequence(id, sk.Sequence, "") + + // Copy source from parent + if parentSource != "" { + seq.SetSource(parentSource) + } + + // Set attributes + seq.SetAttribute("minimizer_value", sk.Minimizer) + + // Decode the minimizer to get its DNA sequence + minimizerSeq := DecodeKmer(sk.Minimizer, m, nil) + seq.SetAttribute("minimizer_seq", string(minimizerSeq)) + + seq.SetAttribute("k", k) + seq.SetAttribute("m", m) + seq.SetAttribute("start", sk.Start) + seq.SetAttribute("end", sk.End) + + if parentID != "" { + seq.SetAttribute("parent_id", parentID) + } + + return seq +} + +// SuperKmerWorker creates a SeqWorker that extracts super k-mers from a BioSequence +// and returns them as a slice of BioSequence objects. +// +// The worker copies the source field from the parent sequence to all extracted super k-mers. +// +// Parameters: +// - k: k-mer size (must be between m+1 and 31) +// - m: minimizer size (must be between 1 and k-1) +// +// Returns: +// - SeqWorker: A worker function that can be used in obiiter pipelines +// +// Example: +// +// worker := SuperKmerWorker(21, 11) +// iterator := iterator.MakeIWorker(worker, false) +func SuperKmerWorker(k int, m int) obiseq.SeqWorker { + return func(seq *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + if seq == nil { + return obiseq.BioSequenceSlice{}, nil + } + + // Validate parameters + if m < 1 || m >= k || k < 2 || k > 31 { + return obiseq.BioSequenceSlice{}, fmt.Errorf( + "invalid parameters: k=%d, m=%d (need 1 <= m < k <= 31)", + k, m) + } + + sequence := seq.Sequence() + if len(sequence) < k { + return obiseq.BioSequenceSlice{}, nil + } + + parentID := seq.Id() + parentSource := seq.Source() + + // Extract super k-mers and convert to BioSequences + result := make(obiseq.BioSequenceSlice, 0) + + for sk := range IterSuperKmers(sequence, k, m) { + bioSeq := sk.ToBioSequence(k, m, parentID, parentSource) + result = append(result, bioSeq) + } + + return result, nil + } +} diff --git a/pkg/obikmer/superkmer_iter_test.go b/pkg/obikmer/superkmer_iter_test.go new file mode 100644 index 0000000..5ddda18 --- /dev/null +++ b/pkg/obikmer/superkmer_iter_test.go @@ -0,0 +1,82 @@ +package obikmer + +import ( + "testing" +) + +func TestIterSuperKmers(t *testing.T) { + seq := []byte("ACGTACGTGGGGAAAA") + k := 5 + m := 3 + + count := 0 + for sk := range IterSuperKmers(seq, k, m) { + count++ + t.Logf("SuperKmer %d: Minimizer=%d, Start=%d, End=%d, Seq=%s", + count, sk.Minimizer, sk.Start, sk.End, string(sk.Sequence)) + + // Verify sequence boundaries + if sk.Start < 0 || sk.End > len(seq) { + t.Errorf("Invalid boundaries: Start=%d, End=%d, seqLen=%d", + sk.Start, sk.End, len(seq)) + } + + // Verify sequence content + if string(sk.Sequence) != string(seq[sk.Start:sk.End]) { + t.Errorf("Sequence mismatch: expected %s, got %s", + string(seq[sk.Start:sk.End]), string(sk.Sequence)) + } + } + + if count == 0 { + t.Error("No super k-mers extracted") + } + + t.Logf("Total super k-mers extracted: %d", count) +} + +func TestIterSuperKmersVsSlice(t *testing.T) { + seq := []byte("ACGTACGTGGGGAAAAACGTACGT") + k := 7 + m := 4 + + // Extract using slice version + sliceResult := ExtractSuperKmers(seq, k, m, nil) + + // Extract using iterator version + var iterResult []SuperKmer + for sk := range IterSuperKmers(seq, k, m) { + iterResult = append(iterResult, sk) + } + + // Compare counts + if len(sliceResult) != len(iterResult) { + t.Errorf("Different number of super k-mers: slice=%d, iter=%d", + len(sliceResult), len(iterResult)) + } + + // Compare each super k-mer + for i := 0; i < len(sliceResult) && i < len(iterResult); i++ { + slice := sliceResult[i] + iter := iterResult[i] + + if slice.Minimizer != iter.Minimizer { + t.Errorf("SuperKmer %d: different minimizers: slice=%d, iter=%d", + i, slice.Minimizer, iter.Minimizer) + } + + if slice.Start != iter.Start || slice.End != iter.End { + t.Errorf("SuperKmer %d: different boundaries: slice=[%d:%d], iter=[%d:%d]", + i, slice.Start, slice.End, iter.Start, iter.End) + } + + if string(slice.Sequence) != string(iter.Sequence) { + t.Errorf("SuperKmer %d: different sequences: slice=%s, iter=%s", + i, string(slice.Sequence), string(iter.Sequence)) + } + } +} + +// Note: Tests for ToBioSequence and SuperKmerWorker are in a separate +// integration test package to avoid circular dependencies between +// obikmer and obiseq packages.