mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
Refactor k-mer encoding functions to use 'canonical' terminology
This commit refactors the k-mer encoding and normalization functions to consistently use 'canonical' instead of 'normalized' terminology. This includes renaming functions like EncodeNormalizedKmer to EncodeCanonicalKmer, IterNormalizedKmers to IterCanonicalKmers, and NormalizeKmer to CanonicalKmer; NormalizeCircular keeps its existing name. The change aligns the API with the biological convention in which the 'canonical' form of a k-mer is the lexicographically smaller of the k-mer and its reverse complement. All related documentation and examples have been updated accordingly. The commit also updates the version file with a new commit hash.
This commit is contained in:
@@ -108,22 +108,22 @@ func EncodeKmer(seq []byte, k int) uint64 {
|
|||||||
return kmer
|
return kmer
|
||||||
}
|
}
|
||||||
|
|
||||||
// EncodeNormalizedKmer encodes a single k-mer sequence to its canonical form (uint64).
|
// EncodeCanonicalKmer encodes a single k-mer sequence to its canonical form (uint64).
|
||||||
// Returns the lexicographically smaller of the k-mer and its reverse complement.
|
// Returns the lexicographically smaller of the k-mer and its reverse complement.
|
||||||
// This is the optimal zero-allocation function for encoding a single normalized k-mer.
|
// This is the optimal zero-allocation function for encoding a single canonical k-mer.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||||
// - k: k-mer size (must be between 1 and 31)
|
// - k: k-mer size (must be between 1 and 31)
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - normalized k-mer as uint64
|
// - canonical k-mer as uint64
|
||||||
// - panics if len(seq) != k or k is invalid
|
// - panics if len(seq) != k or k is invalid
|
||||||
//
|
//
|
||||||
// Example:
|
// Example:
|
||||||
//
|
//
|
||||||
// canonical := EncodeNormalizedKmer([]byte("ACGT"), 4)
|
// canonical := EncodeCanonicalKmer([]byte("ACGT"), 4)
|
||||||
func EncodeNormalizedKmer(seq []byte, k int) uint64 {
|
func EncodeCanonicalKmer(seq []byte, k int) uint64 {
|
||||||
if k < 1 || k > 31 {
|
if k < 1 || k > 31 {
|
||||||
panic("k must be between 1 and 31")
|
panic("k must be between 1 and 31")
|
||||||
}
|
}
|
||||||
@@ -265,7 +265,7 @@ func IterKmers(seq []byte, k int) iter.Seq[uint64] {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// IterNormalizedKmersWithErrors returns an iterator over all normalized k-mers
|
// IterCanonicalKmersWithErrors returns an iterator over all canonical k-mers
|
||||||
// with error markers for ambiguous bases. No intermediate slice is allocated.
|
// with error markers for ambiguous bases. No intermediate slice is allocated.
|
||||||
//
|
//
|
||||||
// Ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V) are encoded as 0xFF and detected
|
// Ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V) are encoded as 0xFF and detected
|
||||||
@@ -279,16 +279,16 @@ func IterKmers(seq []byte, k int) iter.Seq[uint64] {
|
|||||||
// - k: k-mer size (must be odd, between 1 and 31)
|
// - k: k-mer size (must be odd, between 1 and 31)
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - iterator yielding uint64 normalized k-mers with error markers
|
// - iterator yielding uint64 canonical k-mers with error markers
|
||||||
//
|
//
|
||||||
// Example:
|
// Example:
|
||||||
//
|
//
|
||||||
// for kmer := range IterNormalizedKmersWithErrors(seq, 21) {
|
// for kmer := range IterCanonicalKmersWithErrors(seq, 21) {
|
||||||
// if GetKmerError(kmer) == 0 {
|
// if GetKmerError(kmer) == 0 {
|
||||||
// bitmap.Add(kmer) // Only add clean k-mers
|
// bitmap.Add(kmer) // Only add clean k-mers
|
||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
func IterNormalizedKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
|
func IterCanonicalKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
|
||||||
return func(yield func(uint64) bool) {
|
return func(yield func(uint64) bool) {
|
||||||
if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
|
if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
|
||||||
return
|
return
|
||||||
@@ -380,7 +380,7 @@ func IterNormalizedKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// IterNormalizedKmers returns an iterator over all normalized (canonical) k-mers.
|
// IterCanonicalKmers returns an iterator over all canonical k-mers.
|
||||||
// No intermediate slice is allocated, making it memory-efficient.
|
// No intermediate slice is allocated, making it memory-efficient.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
@@ -388,14 +388,14 @@ func IterNormalizedKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
|
|||||||
// - k: k-mer size (must be between 1 and 31)
|
// - k: k-mer size (must be between 1 and 31)
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - iterator yielding uint64 normalized k-mers
|
// - iterator yielding uint64 canonical k-mers
|
||||||
//
|
//
|
||||||
// Example:
|
// Example:
|
||||||
//
|
//
|
||||||
// for canonical := range IterNormalizedKmers(seq, 21) {
|
// for canonical := range IterCanonicalKmers(seq, 21) {
|
||||||
// bitmap.Add(canonical)
|
// bitmap.Add(canonical)
|
||||||
// }
|
// }
|
||||||
func IterNormalizedKmers(seq []byte, k int) iter.Seq[uint64] {
|
func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
|
||||||
return func(yield func(uint64) bool) {
|
return func(yield func(uint64) bool) {
|
||||||
if k < 1 || k > 31 || len(seq) < k {
|
if k < 1 || k > 31 || len(seq) < k {
|
||||||
return
|
return
|
||||||
@@ -615,19 +615,19 @@ func ReverseComplement(kmer uint64, k int) uint64 {
|
|||||||
return rc
|
return rc
|
||||||
}
|
}
|
||||||
|
|
||||||
// NormalizeKmer returns the lexicographically smaller of a k-mer and its
|
// CanonicalKmer returns the lexicographically smaller of a k-mer and its
|
||||||
// reverse complement. This canonical form ensures that a k-mer and its
|
// reverse complement. This canonical form ensures that a k-mer and its
|
||||||
// reverse complement map to the same value.
|
// reverse complement map to the same value.
|
||||||
//
|
//
|
||||||
// This implements REVERSE COMPLEMENT normalization (biological canonicalization).
|
// This implements REVERSE COMPLEMENT canonicalization (biological canonical form).
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
// - kmer: the encoded k-mer
|
// - kmer: the encoded k-mer
|
||||||
// - k: the k-mer size (number of nucleotides)
|
// - k: the k-mer size (number of nucleotides)
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - the canonical (normalized) k-mer
|
// - the canonical k-mer
|
||||||
func NormalizeKmer(kmer uint64, k int) uint64 {
|
func CanonicalKmer(kmer uint64, k int) uint64 {
|
||||||
rc := ReverseComplement(kmer, k)
|
rc := ReverseComplement(kmer, k)
|
||||||
if rc < kmer {
|
if rc < kmer {
|
||||||
return rc
|
return rc
|
||||||
@@ -674,26 +674,26 @@ func NormalizeCircular(kmer uint64, k int) uint64 {
|
|||||||
return canonical
|
return canonical
|
||||||
}
|
}
|
||||||
|
|
||||||
// EncodeCircularNormalizedKmer encodes a k-mer and returns its lexicographically
|
// EncodeCircularCanonicalKmer encodes a k-mer and returns its lexicographically
|
||||||
// smallest circular rotation. This is optimized for single k-mer encoding with
|
// smallest circular rotation. This is optimized for single k-mer encoding with
|
||||||
// circular normalization.
|
// circular canonicalization.
|
||||||
//
|
//
|
||||||
// This implements CIRCULAR PERMUTATION normalization, used for entropy-based
|
// This implements CIRCULAR PERMUTATION canonicalization, used for entropy-based
|
||||||
// low-complexity masking. This is DIFFERENT from EncodeNormalizedKmer which
|
// low-complexity masking. This is DIFFERENT from EncodeCanonicalKmer which
|
||||||
// uses reverse complement normalization.
|
// uses reverse complement canonicalization.
|
||||||
//
|
//
|
||||||
// Parameters:
|
// Parameters:
|
||||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||||
// - k: k-mer size (must be between 1 and 31)
|
// - k: k-mer size (must be between 1 and 31)
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - normalized k-mer as uint64 (smallest circular rotation)
|
// - canonical k-mer as uint64 (smallest circular rotation)
|
||||||
// - panics if len(seq) != k or k is invalid
|
// - panics if len(seq) != k or k is invalid
|
||||||
//
|
//
|
||||||
// Example:
|
// Example:
|
||||||
//
|
//
|
||||||
// canonical := EncodeCircularNormalizedKmer([]byte("ACGT"), 4)
|
// canonical := EncodeCircularCanonicalKmer([]byte("ACGT"), 4)
|
||||||
func EncodeCircularNormalizedKmer(seq []byte, k int) uint64 {
|
func EncodeCircularCanonicalKmer(seq []byte, k int) uint64 {
|
||||||
kmer := EncodeKmer(seq, k)
|
kmer := EncodeKmer(seq, k)
|
||||||
return NormalizeCircular(kmer, k)
|
return NormalizeCircular(kmer, k)
|
||||||
}
|
}
|
||||||
@@ -827,7 +827,7 @@ func necklaceCount(n, alphabetSize int) int {
|
|||||||
return sum / n
|
return sum / n
|
||||||
}
|
}
|
||||||
|
|
||||||
// EncodeNormalizedKmersWithErrors converts a DNA sequence to a slice of normalized k-mers
|
// EncodeCanonicalKmersWithErrors converts a DNA sequence to a slice of canonical k-mers
|
||||||
// with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V).
|
// with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V).
|
||||||
//
|
//
|
||||||
// Ambiguous bases are encoded as 0xFF by __single_base_code__ and detected during
|
// Ambiguous bases are encoded as 0xFF by __single_base_code__ and detected during
|
||||||
@@ -846,9 +846,9 @@ func necklaceCount(n, alphabetSize int) int {
|
|||||||
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - slice of uint64 normalized k-mers with error markers
|
// - slice of uint64 canonical k-mers with error markers
|
||||||
// - nil if sequence is shorter than k, k is invalid, or k is even
|
// - nil if sequence is shorter than k, k is invalid, or k is even
|
||||||
func EncodeNormalizedKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint64 {
|
func EncodeCanonicalKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint64 {
|
||||||
if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
|
if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -860,14 +860,14 @@ func EncodeNormalizedKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint
|
|||||||
result = (*buffer)[:0]
|
result = (*buffer)[:0]
|
||||||
}
|
}
|
||||||
|
|
||||||
for kmer := range IterNormalizedKmersWithErrors(seq, k) {
|
for kmer := range IterCanonicalKmersWithErrors(seq, k) {
|
||||||
result = append(result, kmer)
|
result = append(result, kmer)
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// EncodeNormalizedKmers converts a DNA sequence to a slice of normalized k-mers.
|
// EncodeCanonicalKmers converts a DNA sequence to a slice of canonical k-mers.
|
||||||
// Each k-mer is replaced by the lexicographically smaller of itself and its
|
// Each k-mer is replaced by the lexicographically smaller of itself and its
|
||||||
// reverse complement. This ensures that forward and reverse complement sequences
|
// reverse complement. This ensures that forward and reverse complement sequences
|
||||||
// produce the same k-mer set.
|
// produce the same k-mer set.
|
||||||
@@ -881,9 +881,9 @@ func EncodeNormalizedKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint
|
|||||||
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
|
||||||
//
|
//
|
||||||
// Returns:
|
// Returns:
|
||||||
// - slice of uint64 normalized k-mers
|
// - slice of uint64 canonical k-mers
|
||||||
// - nil if sequence is shorter than k or k is invalid
|
// - nil if sequence is shorter than k or k is invalid
|
||||||
func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
|
func EncodeCanonicalKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
|
||||||
if k < 1 || k > 31 || len(seq) < k {
|
if k < 1 || k > 31 || len(seq) < k {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -895,7 +895,7 @@ func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
|
|||||||
result = (*buffer)[:0]
|
result = (*buffer)[:0]
|
||||||
}
|
}
|
||||||
|
|
||||||
for kmer := range IterNormalizedKmers(seq, k) {
|
for kmer := range IterCanonicalKmers(seq, k) {
|
||||||
result = append(result, kmer)
|
result = append(result, kmer)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ func NewFrequencyFilter(k, minFreq int) *FrequencyFilter {
|
|||||||
// Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire
|
// Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire
|
||||||
func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) {
|
func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) {
|
||||||
rawSeq := seq.Sequence()
|
rawSeq := seq.Sequence()
|
||||||
for canonical := range IterNormalizedKmers(rawSeq, ff.K()) {
|
for canonical := range IterCanonicalKmers(rawSeq, ff.K()) {
|
||||||
ff.AddKmerCode(canonical)
|
ff.AddKmerCode(canonical)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -45,9 +45,9 @@ func (ff *FrequencyFilter) AddKmerCode(kmer uint64) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// AddNormalizedKmerCode ajoute un k-mer encodé normalisé au filtre
|
// AddCanonicalKmerCode ajoute un k-mer encodé canonique au filtre
|
||||||
func (ff *FrequencyFilter) AddNormalizedKmerCode(kmer uint64) {
|
func (ff *FrequencyFilter) AddCanonicalKmerCode(kmer uint64) {
|
||||||
canonical := NormalizeKmer(kmer, ff.K())
|
canonical := CanonicalKmer(kmer, ff.K())
|
||||||
ff.AddKmerCode(canonical)
|
ff.AddKmerCode(canonical)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -59,11 +59,11 @@ func (ff *FrequencyFilter) AddKmer(seq []byte) {
|
|||||||
ff.AddKmerCode(kmer)
|
ff.AddKmerCode(kmer)
|
||||||
}
|
}
|
||||||
|
|
||||||
// AddNormalizedKmer ajoute un k-mer normalisé au filtre en encodant la séquence
|
// AddCanonicalKmer ajoute un k-mer canonique au filtre en encodant la séquence
|
||||||
// La séquence doit avoir exactement k nucléotides
|
// La séquence doit avoir exactement k nucléotides
|
||||||
// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
|
// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
|
||||||
func (ff *FrequencyFilter) AddNormalizedKmer(seq []byte) {
|
func (ff *FrequencyFilter) AddCanonicalKmer(seq []byte) {
|
||||||
canonical := EncodeNormalizedKmer(seq, ff.K())
|
canonical := EncodeCanonicalKmer(seq, ff.K())
|
||||||
ff.AddKmerCode(canonical)
|
ff.AddKmerCode(canonical)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -183,14 +183,14 @@ func (ff *FrequencyFilter) Load(path string) error {
|
|||||||
|
|
||||||
// Contains vérifie si un k-mer a atteint la fréquence minimale
|
// Contains vérifie si un k-mer a atteint la fréquence minimale
|
||||||
func (ff *FrequencyFilter) Contains(kmer uint64) bool {
|
func (ff *FrequencyFilter) Contains(kmer uint64) bool {
|
||||||
canonical := NormalizeKmer(kmer, ff.K())
|
canonical := CanonicalKmer(kmer, ff.K())
|
||||||
return ff.Get(ff.MinFreq - 1).Contains(canonical)
|
return ff.Get(ff.MinFreq - 1).Contains(canonical)
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetFrequency retourne la fréquence approximative d'un k-mer
|
// GetFrequency retourne la fréquence approximative d'un k-mer
|
||||||
// Retourne le niveau maximum atteint (freq ≥ niveau)
|
// Retourne le niveau maximum atteint (freq ≥ niveau)
|
||||||
func (ff *FrequencyFilter) GetFrequency(kmer uint64) int {
|
func (ff *FrequencyFilter) GetFrequency(kmer uint64) int {
|
||||||
canonical := NormalizeKmer(kmer, ff.K())
|
canonical := CanonicalKmer(kmer, ff.K())
|
||||||
|
|
||||||
freq := 0
|
freq := 0
|
||||||
for i := 0; i < ff.MinFreq; i++ {
|
for i := 0; i < ff.MinFreq; i++ {
|
||||||
|
|||||||
@@ -44,9 +44,9 @@ func (ks *KmerSet) AddKmerCode(kmer uint64) {
|
|||||||
ks.bitmap.Add(kmer)
|
ks.bitmap.Add(kmer)
|
||||||
}
|
}
|
||||||
|
|
||||||
// AddNormalizedKmerCode ajoute un k-mer encodé normalisé à l'ensemble
|
// AddCanonicalKmerCode ajoute un k-mer encodé canonique à l'ensemble
|
||||||
func (ks *KmerSet) AddNormalizedKmerCode(kmer uint64) {
|
func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
|
||||||
canonical := NormalizeKmer(kmer, ks.k)
|
canonical := CanonicalKmer(kmer, ks.k)
|
||||||
ks.bitmap.Add(canonical)
|
ks.bitmap.Add(canonical)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -58,11 +58,11 @@ func (ks *KmerSet) AddKmer(seq []byte) {
|
|||||||
ks.bitmap.Add(kmer)
|
ks.bitmap.Add(kmer)
|
||||||
}
|
}
|
||||||
|
|
||||||
// AddNormalizedKmer ajoute un k-mer normalisé à l'ensemble en encodant la séquence
|
// AddCanonicalKmer ajoute un k-mer canonique à l'ensemble en encodant la séquence
|
||||||
// La séquence doit avoir exactement k nucléotides
|
// La séquence doit avoir exactement k nucléotides
|
||||||
// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
|
// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
|
||||||
func (ks *KmerSet) AddNormalizedKmer(seq []byte) {
|
func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
|
||||||
canonical := EncodeNormalizedKmer(seq, ks.k)
|
canonical := EncodeCanonicalKmer(seq, ks.k)
|
||||||
ks.bitmap.Add(canonical)
|
ks.bitmap.Add(canonical)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -70,7 +70,7 @@ func (ks *KmerSet) AddNormalizedKmer(seq []byte) {
|
|||||||
// Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire
|
// Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire
|
||||||
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
|
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
|
||||||
rawSeq := seq.Sequence()
|
rawSeq := seq.Sequence()
|
||||||
for canonical := range IterNormalizedKmers(rawSeq, ks.k) {
|
for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
|
||||||
ks.bitmap.Add(canonical)
|
ks.bitmap.Add(canonical)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ import (
|
|||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// committed
|
// committed
|
||||||
|
|
||||||
var _Commit = "6c6c369"
|
var _Commit = "16f72e6"
|
||||||
var _Version = "Release 4.4.0"
|
var _Version = "Release 4.4.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
|||||||
Reference in New Issue
Block a user