mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 13:30:52 +00:00
This commit refactors all k-mer encoding and normalization functions to consistently use 'canonical' instead of 'normalized' terminology. This includes renaming functions like EncodeNormalizedKmer to EncodeCanonicalKmer, IterNormalizedKmers to IterCanonicalKmers, and NormalizeKmer to CanonicalKmer. The change aligns the API with biological conventions where 'canonical' refers to the lexicographically smallest representation of a k-mer and its reverse complement. All related documentation and examples have been updated accordingly. The commit also updates the version file with a new commit hash.
904 lines
23 KiB
Go
904 lines
23 KiB
Go
package obikmer
|
|
|
|
import "iter"
|
|
|
|
var __single_base_code_err__ = []byte{0,
|
|
// A, B, C, D,
|
|
0, 0xFF, 1, 0xFF,
|
|
// E, F, G, H,
|
|
0xFF, 0xFF, 2, 0xFF,
|
|
// I, J, K, L,
|
|
0xFF, 0xFF, 0xFF, 0xFF,
|
|
// M, N, O, P,
|
|
0xFF, 0xFF, 0xFF, 0xFF,
|
|
// Q, R, S, T,
|
|
0xFF, 0xFF, 0xFF, 3,
|
|
// U, V, W, X,
|
|
3, 0xFF, 0xFF, 0xFF,
|
|
// Y, Z, ., .
|
|
0xFF, 0xFF, 0xFF, 0xFF,
|
|
0xFF, 0xFF, 0xFF,
|
|
}
|
|
|
|
const ambiguousBaseCode = byte(0xFF)
|
|
|
|
// Error markers for k-mers of odd length ≤ 31
|
|
// For odd k ≤ 31, only k*2 bits are used (max 62 bits), leaving 2 high bits
|
|
// available for error coding in the top 2 bits (bits 62-63).
|
|
//
|
|
// Error codes are simple integers:
|
|
//
|
|
// 0 = no error
|
|
// 1 = error type 1
|
|
// 2 = error type 2
|
|
// 3 = error type 3
|
|
//
|
|
// Use SetKmerError(kmer, code) and GetKmerError(kmer) to manipulate error bits.
|
|
const (
|
|
KmerErrorMask uint64 = 0b11 << 62 // Mask to extract error bits (bits 62-63)
|
|
KmerSequenceMask uint64 = ^KmerErrorMask // Mask to extract sequence bits (bits 0-61)
|
|
)
|
|
|
|
// SetKmerError sets the error marker bits on a k-mer encoded value.
|
|
// Only valid for odd k-mer sizes ≤ 31 where 2 bits remain unused.
|
|
//
|
|
// Parameters:
|
|
// - kmer: the encoded k-mer value
|
|
// - errorCode: error code (0-3), where 0=no error, 1-3=error types
|
|
//
|
|
// Returns:
|
|
// - k-mer with error bits set
|
|
func SetKmerError(kmer uint64, errorCode uint64) uint64 {
|
|
return (kmer & KmerSequenceMask) | ((errorCode & 0b11) << 62)
|
|
}
|
|
|
|
// GetKmerError extracts the error marker bits from a k-mer encoded value.
|
|
//
|
|
// Returns:
|
|
// - error code (0-3) as raw value (not shifted)
|
|
func GetKmerError(kmer uint64) uint64 {
|
|
return (kmer & KmerErrorMask) >> 62
|
|
}
|
|
|
|
// ClearKmerError removes the error marker bits from a k-mer, returning
|
|
// just the sequence encoding.
|
|
//
|
|
// Returns:
|
|
// - k-mer with error bits cleared (set to 00)
|
|
func ClearKmerError(kmer uint64) uint64 {
|
|
return kmer & KmerSequenceMask
|
|
}
|
|
|
|
// EncodeKmer encodes a single k-mer sequence to uint64.
|
|
// This is the optimal zero-allocation function for encoding a single k-mer.
|
|
//
|
|
// Each nucleotide is encoded on 2 bits according to __single_base_code__:
|
|
// - A = 0 (00)
|
|
// - C = 1 (01)
|
|
// - G = 2 (10)
|
|
// - T/U = 3 (11)
|
|
//
|
|
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
|
// available for error markers if needed.
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between 1 and 31)
|
|
//
|
|
// Returns:
|
|
// - encoded k-mer as uint64
|
|
// - panics if len(seq) != k or k is invalid
|
|
//
|
|
// Example:
|
|
//
|
|
// kmer := EncodeKmer([]byte("ACGT"), 4)
|
|
func EncodeKmer(seq []byte, k int) uint64 {
|
|
if k < 1 || k > 31 {
|
|
panic("k must be between 1 and 31")
|
|
}
|
|
if len(seq) != k {
|
|
panic("sequence length must equal k")
|
|
}
|
|
|
|
var kmer uint64
|
|
for i := 0; i < k; i++ {
|
|
kmer <<= 2
|
|
kmer |= uint64(__single_base_code__[seq[i]&31])
|
|
}
|
|
return kmer
|
|
}
|
|
|
|
// EncodeCanonicalKmer encodes a single k-mer sequence to its canonical form (uint64).
|
|
// Returns the lexicographically smaller of the k-mer and its reverse complement.
|
|
// This is the optimal zero-allocation function for encoding a single canonical k-mer.
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between 1 and 31)
|
|
//
|
|
// Returns:
|
|
// - canonical k-mer as uint64
|
|
// - panics if len(seq) != k or k is invalid
|
|
//
|
|
// Example:
|
|
//
|
|
// canonical := EncodeCanonicalKmer([]byte("ACGT"), 4)
|
|
func EncodeCanonicalKmer(seq []byte, k int) uint64 {
|
|
if k < 1 || k > 31 {
|
|
panic("k must be between 1 and 31")
|
|
}
|
|
if len(seq) != k {
|
|
panic("sequence length must equal k")
|
|
}
|
|
|
|
rcShift := uint((k - 1) * 2)
|
|
|
|
var fwd, rvc uint64
|
|
for i := 0; i < k; i++ {
|
|
code := uint64(__single_base_code__[seq[i]&31])
|
|
fwd <<= 2
|
|
fwd |= code
|
|
rvc >>= 2
|
|
rvc |= (code ^ 3) << rcShift
|
|
}
|
|
|
|
if fwd <= rvc {
|
|
return fwd
|
|
}
|
|
return rvc
|
|
}
|
|
|
|
// DecodeKmer decodes a uint64 k-mer back to a DNA sequence.
|
|
// This function reuses a provided buffer to avoid allocation.
|
|
//
|
|
// Parameters:
|
|
// - kmer: encoded k-mer as uint64
|
|
// - k: k-mer size (number of nucleotides)
|
|
// - buffer: pre-allocated buffer of length >= k (if nil, allocates new slice)
|
|
//
|
|
// Returns:
|
|
// - decoded DNA sequence as []byte (lowercase acgt)
|
|
//
|
|
// Example:
|
|
//
|
|
// var buf [32]byte
|
|
// seq := DecodeKmer(kmer, 21, buf[:])
|
|
func DecodeKmer(kmer uint64, k int, buffer []byte) []byte {
|
|
var result []byte
|
|
if buffer == nil || len(buffer) < k {
|
|
result = make([]byte, k)
|
|
} else {
|
|
result = buffer[:k]
|
|
}
|
|
|
|
bases := [4]byte{'a', 'c', 'g', 't'}
|
|
for i := k - 1; i >= 0; i-- {
|
|
result[i] = bases[kmer&3]
|
|
kmer >>= 2
|
|
}
|
|
return result
|
|
}
|
|
|
|
// EncodeKmers converts a DNA sequence to a slice of encoded k-mers.
|
|
// Each nucleotide is encoded on 2 bits according to __single_base_code__:
|
|
// - A = 0 (00)
|
|
// - C = 1 (01)
|
|
// - G = 2 (10)
|
|
// - T/U = 3 (11)
|
|
//
|
|
// The function returns overlapping k-mers of size k encoded as uint64.
|
|
// For a sequence of length n, it returns n-k+1 k-mers.
|
|
//
|
|
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
|
// available for error markers (see SetKmerError).
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between 1 and 31)
|
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
|
//
|
|
// Returns:
|
|
// - slice of uint64 encoded k-mers
|
|
// - nil if sequence is shorter than k or k is invalid
|
|
func EncodeKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
|
|
if k < 1 || k > 31 || len(seq) < k {
|
|
return nil
|
|
}
|
|
|
|
var result []uint64
|
|
if buffer == nil {
|
|
result = make([]uint64, 0, len(seq)-k+1)
|
|
} else {
|
|
result = (*buffer)[:0]
|
|
}
|
|
|
|
for kmer := range IterKmers(seq, k) {
|
|
result = append(result, kmer)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// IterKmers returns an iterator over all k-mers in the sequence.
|
|
// No intermediate slice is allocated, making it memory-efficient for
|
|
// processing k-mers one by one (e.g., adding to a Roaring Bitmap).
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between 1 and 31)
|
|
//
|
|
// Returns:
|
|
// - iterator yielding uint64 encoded k-mers
|
|
//
|
|
// Example:
|
|
//
|
|
// for kmer := range IterKmers(seq, 21) {
|
|
// bitmap.Add(kmer)
|
|
// }
|
|
func IterKmers(seq []byte, k int) iter.Seq[uint64] {
|
|
return func(yield func(uint64) bool) {
|
|
if k < 1 || k > 31 || len(seq) < k {
|
|
return
|
|
}
|
|
|
|
mask := uint64(1)<<(k*2) - 1
|
|
|
|
var kmer uint64
|
|
for i := 0; i < k; i++ {
|
|
kmer <<= 2
|
|
kmer |= uint64(__single_base_code__[seq[i]&31])
|
|
}
|
|
|
|
if !yield(kmer) {
|
|
return
|
|
}
|
|
|
|
for i := k; i < len(seq); i++ {
|
|
kmer <<= 2
|
|
kmer |= uint64(__single_base_code__[seq[i]&31])
|
|
kmer &= mask
|
|
|
|
if !yield(kmer) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// IterCanonicalKmersWithErrors returns an iterator over all canonical k-mers
|
|
// with error markers for ambiguous bases. No intermediate slice is allocated.
|
|
//
|
|
// Ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V) are encoded as 0xFF and detected
|
|
// during k-mer construction. The error code in bits 62-63 indicates the number of
|
|
// ambiguous bases in each k-mer (0=clean, 1-3=error count).
|
|
//
|
|
// Only valid for odd k ≤ 31 where 2 bits remain unused for error markers.
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U, and ambiguous bases)
|
|
// - k: k-mer size (must be odd, between 1 and 31)
|
|
//
|
|
// Returns:
|
|
// - iterator yielding uint64 canonical k-mers with error markers
|
|
//
|
|
// Example:
|
|
//
|
|
// for kmer := range IterCanonicalKmersWithErrors(seq, 21) {
|
|
// if GetKmerError(kmer) == 0 {
|
|
// bitmap.Add(kmer) // Only add clean k-mers
|
|
// }
|
|
// }
|
|
func IterCanonicalKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
|
|
return func(yield func(uint64) bool) {
|
|
if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
|
|
return
|
|
}
|
|
|
|
mask := uint64(1)<<(k*2) - 1
|
|
|
|
rcShift := uint((k - 1) * 2)
|
|
|
|
ambiguousCount := 0
|
|
const ambiguousCode = byte(0xFF)
|
|
|
|
var fwd, rvc uint64
|
|
hasError := false
|
|
for i := 0; i < k; i++ {
|
|
code := __single_base_code_err__[seq[i]&31]
|
|
|
|
if code == ambiguousCode {
|
|
ambiguousCount++
|
|
hasError = true
|
|
code = 0
|
|
}
|
|
|
|
codeUint := uint64(code)
|
|
fwd <<= 2
|
|
fwd |= codeUint
|
|
rvc >>= 2
|
|
rvc |= (codeUint ^ 3) << rcShift
|
|
}
|
|
|
|
var canonical uint64
|
|
if fwd <= rvc {
|
|
canonical = fwd
|
|
} else {
|
|
canonical = rvc
|
|
}
|
|
|
|
if hasError {
|
|
errorCode := uint64(ambiguousCount)
|
|
if errorCode > 3 {
|
|
errorCode = 3
|
|
}
|
|
canonical = SetKmerError(canonical, errorCode)
|
|
}
|
|
|
|
if !yield(canonical) {
|
|
return
|
|
}
|
|
|
|
for i := k; i < len(seq); i++ {
|
|
outgoingCode := __single_base_code_err__[seq[i-k]&31]
|
|
if outgoingCode == ambiguousCode {
|
|
ambiguousCount--
|
|
}
|
|
|
|
code := __single_base_code_err__[seq[i]&31]
|
|
if code == ambiguousCode {
|
|
ambiguousCount++
|
|
code = 0
|
|
}
|
|
|
|
codeUint := uint64(code)
|
|
|
|
fwd <<= 2
|
|
fwd |= codeUint
|
|
fwd &= mask
|
|
|
|
rvc >>= 2
|
|
rvc |= (codeUint ^ 3) << rcShift
|
|
|
|
if fwd <= rvc {
|
|
canonical = fwd
|
|
} else {
|
|
canonical = rvc
|
|
}
|
|
|
|
if ambiguousCount > 0 {
|
|
errorCode := uint64(ambiguousCount)
|
|
if errorCode > 3 {
|
|
errorCode = 3
|
|
}
|
|
canonical = SetKmerError(canonical, errorCode)
|
|
}
|
|
|
|
if !yield(canonical) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// IterCanonicalKmers returns an iterator over all canonical k-mers.
|
|
// No intermediate slice is allocated, making it memory-efficient.
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between 1 and 31)
|
|
//
|
|
// Returns:
|
|
// - iterator yielding uint64 canonical k-mers
|
|
//
|
|
// Example:
|
|
//
|
|
// for canonical := range IterCanonicalKmers(seq, 21) {
|
|
// bitmap.Add(canonical)
|
|
// }
|
|
func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
|
|
return func(yield func(uint64) bool) {
|
|
if k < 1 || k > 31 || len(seq) < k {
|
|
return
|
|
}
|
|
|
|
mask := uint64(1)<<(k*2) - 1
|
|
|
|
rcShift := uint((k - 1) * 2)
|
|
|
|
var fwd, rvc uint64
|
|
for i := 0; i < k; i++ {
|
|
code := uint64(__single_base_code__[seq[i]&31])
|
|
fwd <<= 2
|
|
fwd |= code
|
|
rvc >>= 2
|
|
rvc |= (code ^ 3) << rcShift
|
|
}
|
|
|
|
var canonical uint64
|
|
if fwd <= rvc {
|
|
canonical = fwd
|
|
} else {
|
|
canonical = rvc
|
|
}
|
|
|
|
if !yield(canonical) {
|
|
return
|
|
}
|
|
|
|
for i := k; i < len(seq); i++ {
|
|
code := uint64(__single_base_code__[seq[i]&31])
|
|
|
|
fwd <<= 2
|
|
fwd |= code
|
|
fwd &= mask
|
|
|
|
rvc >>= 2
|
|
rvc |= (code ^ 3) << rcShift
|
|
|
|
if fwd <= rvc {
|
|
canonical = fwd
|
|
} else {
|
|
canonical = rvc
|
|
}
|
|
|
|
if !yield(canonical) {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// SuperKmer represents a maximal subsequence where all consecutive k-mers
|
|
// share the same minimizer. A minimizer is the smallest canonical m-mer
|
|
// among the (k-m+1) m-mers contained in a k-mer.
|
|
type SuperKmer struct {
|
|
Minimizer uint64 // The canonical minimizer value (normalized m-mer)
|
|
Start int // Starting position in the original sequence (0-indexed)
|
|
End int // Ending position (exclusive, like Go slice notation)
|
|
Sequence []byte // The actual DNA subsequence [Start:End]
|
|
}
|
|
|
|
// dequeItem represents an element in the monotone deque used for
|
|
// tracking minimizers in a sliding window.
|
|
type dequeItem struct {
|
|
position int // Position of the m-mer in the sequence
|
|
canonical uint64 // Canonical (normalized) m-mer value
|
|
}
|
|
|
|
// ExtractSuperKmers extracts super k-mers from a DNA sequence.
|
|
// A super k-mer is a maximal subsequence where all consecutive k-mers
|
|
// share the same minimizer. The minimizer of a k-mer is the smallest
|
|
// canonical m-mer among its (k-m+1) constituent m-mers.
|
|
//
|
|
// The algorithm uses:
|
|
// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation
|
|
// - Monotone deque for O(1) amortized minimizer tracking per position
|
|
//
|
|
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
|
// available for error markers if needed.
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between m+1 and 31)
|
|
// - m: minimizer size (must be between 1 and k-1)
|
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
|
//
|
|
// Returns:
|
|
// - slice of SuperKmer structs representing maximal subsequences
|
|
// - nil if parameters are invalid or sequence is too short
|
|
//
|
|
// Time complexity: O(n) where n is the sequence length
|
|
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
|
|
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
|
|
if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
|
|
return nil
|
|
}
|
|
|
|
var result []SuperKmer
|
|
if buffer == nil {
|
|
estimatedSize := len(seq) / k
|
|
if estimatedSize < 1 {
|
|
estimatedSize = 1
|
|
}
|
|
result = make([]SuperKmer, 0, estimatedSize)
|
|
} else {
|
|
result = (*buffer)[:0]
|
|
}
|
|
|
|
deque := make([]dequeItem, 0, k-m+1)
|
|
|
|
mMask := uint64(1)<<(m*2) - 1
|
|
rcShift := uint((m - 1) * 2)
|
|
|
|
var fwdMmer, rvcMmer uint64
|
|
for i := 0; i < m-1 && i < len(seq); i++ {
|
|
code := uint64(__single_base_code__[seq[i]&31])
|
|
fwdMmer = (fwdMmer << 2) | code
|
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
|
}
|
|
|
|
superKmerStart := 0
|
|
var currentMinimizer uint64
|
|
firstKmer := true
|
|
|
|
for pos := m - 1; pos < len(seq); pos++ {
|
|
code := uint64(__single_base_code__[seq[pos]&31])
|
|
fwdMmer = ((fwdMmer << 2) | code) & mMask
|
|
rvcMmer = (rvcMmer >> 2) | ((code ^ 3) << rcShift)
|
|
|
|
canonical := fwdMmer
|
|
if rvcMmer < fwdMmer {
|
|
canonical = rvcMmer
|
|
}
|
|
|
|
mmerPos := pos - m + 1
|
|
|
|
if pos >= k-1 {
|
|
windowStart := pos - k + 1
|
|
for len(deque) > 0 && deque[0].position < windowStart {
|
|
deque = deque[1:]
|
|
}
|
|
}
|
|
|
|
for len(deque) > 0 && deque[len(deque)-1].canonical >= canonical {
|
|
deque = deque[:len(deque)-1]
|
|
}
|
|
|
|
deque = append(deque, dequeItem{position: mmerPos, canonical: canonical})
|
|
|
|
if pos >= k-1 {
|
|
newMinimizer := deque[0].canonical
|
|
kmerStart := pos - k + 1
|
|
|
|
if firstKmer {
|
|
currentMinimizer = newMinimizer
|
|
firstKmer = false
|
|
} else if newMinimizer != currentMinimizer {
|
|
endPos := kmerStart + k - 1
|
|
superKmer := SuperKmer{
|
|
Minimizer: currentMinimizer,
|
|
Start: superKmerStart,
|
|
End: endPos,
|
|
Sequence: seq[superKmerStart:endPos],
|
|
}
|
|
result = append(result, superKmer)
|
|
|
|
superKmerStart = kmerStart
|
|
currentMinimizer = newMinimizer
|
|
}
|
|
}
|
|
}
|
|
|
|
if !firstKmer {
|
|
superKmer := SuperKmer{
|
|
Minimizer: currentMinimizer,
|
|
Start: superKmerStart,
|
|
End: len(seq),
|
|
Sequence: seq[superKmerStart:],
|
|
}
|
|
result = append(result, superKmer)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// ReverseComplement computes the reverse complement of an encoded k-mer.
|
|
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
|
|
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
|
|
// The reverse swaps the order of 2-bit pairs.
|
|
//
|
|
// For k-mers with error markers (top 2 bits), the error bits are preserved
|
|
// and transferred to the reverse complement.
|
|
//
|
|
// Parameters:
|
|
// - kmer: the encoded k-mer (possibly with error bits in positions 62-63)
|
|
// - k: the k-mer size (number of nucleotides)
|
|
//
|
|
// Returns:
|
|
// - the reverse complement of the k-mer with error bits preserved
|
|
func ReverseComplement(kmer uint64, k int) uint64 {
|
|
errorBits := kmer & KmerErrorMask
|
|
|
|
mask := uint64(1)<<(k*2) - 1
|
|
rc := (^kmer) & mask
|
|
|
|
rc = ((rc & 0x3333333333333333) << 2) | ((rc & 0xCCCCCCCCCCCCCCCC) >> 2)
|
|
rc = ((rc & 0x0F0F0F0F0F0F0F0F) << 4) | ((rc & 0xF0F0F0F0F0F0F0F0) >> 4)
|
|
rc = ((rc & 0x00FF00FF00FF00FF) << 8) | ((rc & 0xFF00FF00FF00FF00) >> 8)
|
|
rc = ((rc & 0x0000FFFF0000FFFF) << 16) | ((rc & 0xFFFF0000FFFF0000) >> 16)
|
|
rc = (rc << 32) | (rc >> 32)
|
|
|
|
rc >>= (64 - k*2)
|
|
|
|
rc |= errorBits
|
|
|
|
return rc
|
|
}
|
|
|
|
// CanonicalKmer returns the lexicographically smaller of a k-mer and its
|
|
// reverse complement. This canonical form ensures that a k-mer and its
|
|
// reverse complement map to the same value.
|
|
//
|
|
// This implements REVERSE COMPLEMENT canonicalization (biological canonical form).
|
|
//
|
|
// Parameters:
|
|
// - kmer: the encoded k-mer
|
|
// - k: the k-mer size (number of nucleotides)
|
|
//
|
|
// Returns:
|
|
// - the canonical k-mer
|
|
func CanonicalKmer(kmer uint64, k int) uint64 {
|
|
rc := ReverseComplement(kmer, k)
|
|
if rc < kmer {
|
|
return rc
|
|
}
|
|
return kmer
|
|
}
|
|
|
|
// NormalizeCircular returns the lexicographically smallest circular rotation
|
|
// of a k-mer. This is used for entropy calculations in low-complexity masking.
|
|
//
|
|
// This implements CIRCULAR PERMUTATION normalization (rotation-based canonicalization).
|
|
// Example: ACGT → min(ACGT, CGTA, GTAC, TACG) by circular rotation
|
|
//
|
|
// This is DIFFERENT from NormalizeKmer which uses reverse complement.
|
|
//
|
|
// Parameters:
|
|
// - kmer: the encoded k-mer
|
|
// - k: the k-mer size (number of nucleotides)
|
|
//
|
|
// Returns:
|
|
// - the lexicographically smallest circular rotation
|
|
//
|
|
// Time complexity: O(k) - checks all k rotations
|
|
func NormalizeCircular(kmer uint64, k int) uint64 {
|
|
if k < 1 || k > 31 {
|
|
return kmer
|
|
}
|
|
|
|
mask := uint64(1)<<(k*2) - 1
|
|
canonical := kmer
|
|
current := kmer
|
|
|
|
// Try all k rotations
|
|
for i := 0; i < k; i++ {
|
|
// Rotate: take top 2 bits, shift left, add to bottom
|
|
top := (current >> ((k - 1) * 2)) & 3
|
|
current = ((current << 2) | top) & mask
|
|
|
|
if current < canonical {
|
|
canonical = current
|
|
}
|
|
}
|
|
|
|
return canonical
|
|
}
|
|
|
|
// EncodeCircularCanonicalKmer encodes a k-mer and returns its lexicographically
|
|
// smallest circular rotation. This is optimized for single k-mer encoding with
|
|
// circular canonicalization.
|
|
//
|
|
// This implements CIRCULAR PERMUTATION canonicalization, used for entropy-based
|
|
// low-complexity masking. This is DIFFERENT from EncodeCanonicalKmer which
|
|
// uses reverse complement canonicalization.
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between 1 and 31)
|
|
//
|
|
// Returns:
|
|
// - canonical k-mer as uint64 (smallest circular rotation)
|
|
// - panics if len(seq) != k or k is invalid
|
|
//
|
|
// Example:
|
|
//
|
|
// canonical := EncodeCircularCanonicalKmer([]byte("ACGT"), 4)
|
|
func EncodeCircularCanonicalKmer(seq []byte, k int) uint64 {
|
|
kmer := EncodeKmer(seq, k)
|
|
return NormalizeCircular(kmer, k)
|
|
}
|
|
|
|
// CanonicalCircularKmerCount returns the number of unique canonical k-mers
|
|
// under circular permutation normalization for DNA sequences (4-letter alphabet).
|
|
//
|
|
// This counts equivalence classes where k-mers are considered the same if one
|
|
// is a circular rotation of another (e.g., "ACGT", "CGTA", "GTAC", "TACG" are
|
|
// all equivalent).
|
|
//
|
|
// Uses Moreau's necklace-counting formula for exact counts:
|
|
//
|
|
// N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
|
//
|
|
// where the sum is over all divisors d of n, and φ is Euler's totient function.
|
|
//
|
|
// Parameters:
|
|
// - k: k-mer size
|
|
//
|
|
// Returns:
|
|
// - number of unique canonical k-mers under circular rotation
|
|
//
|
|
// Example:
|
|
//
|
|
// count := CanonicalCircularKmerCount(4) // Returns 70 (not 256)
|
|
func CanonicalCircularKmerCount(k int) int {
|
|
// Hardcoded exact counts for k=1 to 6 (optimization)
|
|
switch k {
|
|
case 1:
|
|
return 4
|
|
case 2:
|
|
return 10
|
|
case 3:
|
|
return 24
|
|
case 4:
|
|
return 70
|
|
case 5:
|
|
return 208
|
|
case 6:
|
|
return 700
|
|
default:
|
|
// For k>6, use Moreau's necklace-counting formula
|
|
return necklaceCount(k, 4)
|
|
}
|
|
}
|
|
|
|
// eulerTotient computes Euler's totient function φ(n), which counts
|
|
// the number of integers from 1 to n that are coprime with n.
|
|
func eulerTotient(n int) int {
|
|
if n <= 0 {
|
|
return 0
|
|
}
|
|
|
|
result := n
|
|
|
|
// Process all prime factors
|
|
for p := 2; p*p <= n; p++ {
|
|
if n%p == 0 {
|
|
// Remove all occurrences of p
|
|
for n%p == 0 {
|
|
n /= p
|
|
}
|
|
// Apply: φ(n) = n * (1 - 1/p) = n * (p-1)/p
|
|
result -= result / p
|
|
}
|
|
}
|
|
|
|
// If n is still greater than 1, then it's a prime factor
|
|
if n > 1 {
|
|
result -= result / n
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// divisors returns all divisors of n in ascending order.
|
|
func divisors(n int) []int {
|
|
if n <= 0 {
|
|
return []int{}
|
|
}
|
|
|
|
divs := []int{}
|
|
for i := 1; i*i <= n; i++ {
|
|
if n%i == 0 {
|
|
divs = append(divs, i)
|
|
if i != n/i {
|
|
divs = append(divs, n/i)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Bubble sort in ascending order
|
|
for i := 0; i < len(divs)-1; i++ {
|
|
for j := i + 1; j < len(divs); j++ {
|
|
if divs[i] > divs[j] {
|
|
divs[i], divs[j] = divs[j], divs[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
return divs
|
|
}
|
|
|
|
// necklaceCount computes the number of distinct necklaces (equivalence classes
|
|
// under rotation) for sequences of length n over an alphabet of size a.
|
|
// Uses Moreau's necklace-counting formula:
|
|
//
|
|
// N(n, a) = (1/n) * Σ φ(d) * a^(n/d)
|
|
//
|
|
// where the sum is over all divisors d of n, and φ is Euler's totient function.
|
|
func necklaceCount(n, alphabetSize int) int {
|
|
if n <= 0 {
|
|
return 0
|
|
}
|
|
|
|
divs := divisors(n)
|
|
sum := 0
|
|
|
|
for _, d := range divs {
|
|
// Compute a^(n/d)
|
|
power := 1
|
|
exp := n / d
|
|
for i := 0; i < exp; i++ {
|
|
power *= alphabetSize
|
|
}
|
|
|
|
sum += eulerTotient(d) * power
|
|
}
|
|
|
|
return sum / n
|
|
}
|
|
|
|
// EncodeCanonicalKmersWithErrors converts a DNA sequence to a slice of canonical k-mers
|
|
// with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V).
|
|
//
|
|
// Ambiguous bases are encoded as 0xFF by __single_base_code__ and detected during
|
|
// k-mer construction. The error code in bits 62-63 indicates the number of ambiguous
|
|
// bases in each k-mer:
|
|
// - errorCode 0: no ambiguous bases (clean k-mer)
|
|
// - errorCode 1: 1 ambiguous base
|
|
// - errorCode 2: 2 ambiguous bases
|
|
// - errorCode 3: 3 or more ambiguous bases
|
|
//
|
|
// Only valid for odd k ≤ 31 where 2 bits remain unused for error markers.
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U, and ambiguous bases)
|
|
// - k: k-mer size (must be odd, between 1 and 31)
|
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
|
//
|
|
// Returns:
|
|
// - slice of uint64 canonical k-mers with error markers
|
|
// - nil if sequence is shorter than k, k is invalid, or k is even
|
|
func EncodeCanonicalKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint64 {
|
|
if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
|
|
return nil
|
|
}
|
|
|
|
var result []uint64
|
|
if buffer == nil {
|
|
result = make([]uint64, 0, len(seq)-k+1)
|
|
} else {
|
|
result = (*buffer)[:0]
|
|
}
|
|
|
|
for kmer := range IterCanonicalKmersWithErrors(seq, k) {
|
|
result = append(result, kmer)
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// EncodeCanonicalKmers converts a DNA sequence to a slice of canonical k-mers.
|
|
// Each k-mer is replaced by the lexicographically smaller of itself and its
|
|
// reverse complement. This ensures that forward and reverse complement sequences
|
|
// produce the same k-mer set.
|
|
//
|
|
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
|
// available for error markers (see SetKmerError).
|
|
//
|
|
// Parameters:
|
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
|
// - k: k-mer size (must be between 1 and 31)
|
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
|
//
|
|
// Returns:
|
|
// - slice of uint64 canonical k-mers
|
|
// - nil if sequence is shorter than k or k is invalid
|
|
func EncodeCanonicalKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
|
|
if k < 1 || k > 31 || len(seq) < k {
|
|
return nil
|
|
}
|
|
|
|
var result []uint64
|
|
if buffer == nil {
|
|
result = make([]uint64, 0, len(seq)-k+1)
|
|
} else {
|
|
result = (*buffer)[:0]
|
|
}
|
|
|
|
for kmer := range IterCanonicalKmers(seq, k) {
|
|
result = append(result, kmer)
|
|
}
|
|
|
|
return result
|
|
}
|