mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
Refactor SuperKmer extraction to use iterator pattern
This commit refactors the SuperKmer extraction functionality to use Go's new iterator pattern. The ExtractSuperKmers function is now implemented as a wrapper around a new IterSuperKmers iterator function, which yields results one at a time instead of building a complete slice. This change provides better memory efficiency and more flexible consumption of super k-mers. The functionality remains the same, but the interface is now more idiomatic and efficient for large datasets.
This commit is contained in:
@@ -447,141 +447,6 @@ func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. A minimizer is the smallest canonical m-mer
|
||||
// among the (k-m+1) m-mers contained in a k-mer.
|
||||
type SuperKmer struct {
|
||||
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
||||
Start int // Starting position in the original sequence (0-indexed)
|
||||
End int // Ending position (exclusive, like Go slice notation)
|
||||
Sequence []byte // The actual DNA subsequence [Start:End]
|
||||
}
|
||||
|
||||
// dequeItem represents an element in the monotone deque used for
|
||||
// tracking minimizers in a sliding window.
|
||||
type dequeItem struct {
|
||||
position int // Position of the m-mer in the sequence
|
||||
canonical uint64 // Canonical (normalized) m-mer value
|
||||
}
|
||||
|
||||
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
||||
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||||
// share the same minimizer. The minimizer of a k-mer is the smallest
|
||||
// canonical m-mer among its (k-m+1) constituent m-mers.
|
||||
//
|
||||
// The algorithm uses:
|
||||
// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation
|
||||
// - Monotone deque for O(1) amortized minimizer tracking per position
|
||||
//
|
||||
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
||||
// available for error markers if needed.
|
||||
//
|
||||
// Parameters:
|
||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||
// - k: k-mer size (must be between m+1 and 31)
|
||||
// - m: minimizer size (must be between 1 and k-1)
|
||||
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||||
//
|
||||
// Returns:
|
||||
// - slice of SuperKmer structs representing maximal subsequences
|
||||
// - nil if parameters are invalid or sequence is too short
|
||||
//
|
||||
// Time complexity: O(n) where n is the sequence length
|
||||
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
||||
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
||||
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||||
return nil
|
||||
}
|
||||
|
||||
var result []SuperKmer
|
||||
if buffer == nil {
|
||||
estimatedSize := len(seq) / k
|
||||
if estimatedSize < 1 {
|
||||
estimatedSize = 1
|
||||
}
|
||||
result = make([]SuperKmer, 0, estimatedSize)
|
||||
} else {
|
||||
result = (*buffer)[:0]
|
||||
}
|
||||
|
||||
deque := make([]dequeItem, 0, k-m+1)
|
||||
|
||||
mMask := uint64(1)<<(m*2) - 1
|
||||
rcShift := uint((m - 1) * 2)
|
||||
|
||||
var fwdMmer, rvcMmer uint64
|
||||
for i := 0; i < m-1 && i < len(seq); i++ {
|
||||
code := uint64(__single_base_code__[seq[i]&31])
|
||||
fwdMmer = (fwdMmer << 2) | code
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
}
|
||||
|
||||
superKmerStart := 0
|
||||
var currentMinimizer uint64
|
||||
firstKmer := true
|
||||
|
||||
for pos := m - 1; pos < len(seq); pos++ {
|
||||
code := uint64(__single_base_code__[seq[pos]&31])
|
||||
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
||||
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
||||
|
||||
canonical := fwdMmer
|
||||
if rvcMmer < fwdMmer {
|
||||
canonical = rvcMmer
|
||||
}
|
||||
|
||||
mmerPos := pos - m + 1
|
||||
|
||||
if pos >= k-1 {
|
||||
windowStart := pos - k + 1
|
||||
for len(deque) > 0 && deque[0].position < windowStart {
|
||||
deque = deque[1:]
|
||||
}
|
||||
}
|
||||
|
||||
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
||||
deque = deque[:len(deque)-1]
|
||||
}
|
||||
|
||||
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
||||
|
||||
if pos >= k-1 {
|
||||
newMinimizer := deque[0].canonical
|
||||
kmerStart := pos - k + 1
|
||||
|
||||
if firstKmer {
|
||||
currentMinimizer = newMinimizer
|
||||
firstKmer = false
|
||||
} else if newMinimizer != currentMinimizer {
|
||||
endPos := kmerStart + k - 1
|
||||
superKmer := SuperKmer{
|
||||
Minimizer: currentMinimizer,
|
||||
Start: superKmerStart,
|
||||
End: endPos,
|
||||
Sequence: seq[superKmerStart:endPos],
|
||||
}
|
||||
result = append(result, superKmer)
|
||||
|
||||
superKmerStart = kmerStart
|
||||
currentMinimizer = newMinimizer
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !firstKmer {
|
||||
superKmer := SuperKmer{
|
||||
Minimizer: currentMinimizer,
|
||||
Start: superKmerStart,
|
||||
End: len(seq),
|
||||
Sequence: seq[superKmerStart:],
|
||||
}
|
||||
result = append(result, superKmer)
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// ReverseComplement computes the reverse complement of an encoded k-mer.
|
||||
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
|
||||
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
|
||||
|
||||
Reference in New Issue
Block a user