mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
60 lines
2.0 KiB
Go
60 lines
2.0 KiB
Go
|
|
package obikmer
|
||
|
|
|
||
|
|
// SuperKmer describes one maximal run of a sequence in which every
// consecutive k-mer shares the same minimizer.
type SuperKmer struct {
	// Minimizer is the canonical (normalized) m-mer value shared by the run.
	Minimizer uint64
	// Start and End delimit the run in the original sequence: Start is the
	// 0-indexed first position and End is exclusive, as in Go slice notation.
	Start, End int
	// Sequence holds the actual DNA subsequence [Start:End].
	Sequence []byte
}
|
||
|
|
|
||
|
|
// dequeItem is a single entry of the monotone deque that tracks minimizer
// candidates while a window slides along the sequence.
type dequeItem struct {
	// position is where the m-mer sits in the sequence.
	position int
	// canonical is the canonical (normalized) value of that m-mer.
	canonical uint64
}
|
||
|
|
|
||
|
|
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
||
|
|
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
||
|
|
// share the same minimizer. The minimizer of a k-mer is the smallest
|
||
|
|
// canonical m-mer among its (k-m+1) constituent m-mers.
|
||
|
|
//
|
||
|
|
// This function uses IterSuperKmers internally and collects results into a slice.
|
||
|
|
//
|
||
|
|
// Parameters:
|
||
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||
|
|
// - k: k-mer size (must be between m+1 and 31)
|
||
|
|
// - m: minimizer size (must be between 1 and k-1)
|
||
|
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||
|
|
//
|
||
|
|
// Returns:
|
||
|
|
// - slice of SuperKmer structs representing maximal subsequences
|
||
|
|
// - nil if parameters are invalid or sequence is too short
|
||
|
|
//
|
||
|
|
// Time complexity: O(n) where n is the sequence length
|
||
|
|
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
||
|
|
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
||
|
|
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
|
||
|
|
var result []SuperKmer
|
||
|
|
if buffer == nil {
|
||
|
|
estimatedSize := len(seq) / k
|
||
|
|
if estimatedSize < 1 {
|
||
|
|
estimatedSize = 1
|
||
|
|
}
|
||
|
|
result = make([]SuperKmer, 0, estimatedSize)
|
||
|
|
} else {
|
||
|
|
result = (*buffer)[:0]
|
||
|
|
}
|
||
|
|
|
||
|
|
for sk := range IterSuperKmers(seq, k, m) {
|
||
|
|
result = append(result, sk)
|
||
|
|
}
|
||
|
|
|
||
|
|
return result
|
||
|
|
}
|