Refactor k-mer encoding functions to use 'canonical' terminology

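This commit renames the reverse-complement k-mer encoding and normalization functions so that they consistently use 'canonical' instead of 'normalized' terminology. For example, EncodeNormalizedKmer becomes EncodeCanonicalKmer, IterNormalizedKmers becomes IterCanonicalKmers, and NormalizeKmer becomes CanonicalKmer; the corresponding FrequencyFilter and KmerSet methods (AddNormalizedKmer, AddNormalizedKmerCode) are renamed in the same way. The change aligns the API with biological conventions, where the 'canonical' form of a k-mer is the lexicographically smaller of the k-mer and its reverse complement. NormalizeCircular keeps its name, although the wrapper around it is renamed from EncodeCircularNormalizedKmer to EncodeCircularCanonicalKmer. All related documentation and examples have been updated accordingly. The commit also updates the version file, setting _Commit to the hash of the parent commit (16f72e6).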
2026-06-24 01:31:00 +00:00 · 2026-02-05 16:14:24 +01:00
parent 16f72e6305
commit 09ac15a76b
4 changed files with 50 additions and 50 deletions
@@ -108,22 +108,22 @@ func EncodeKmer(seq []byte, k int) uint64 {
 	return kmer
 }
-// EncodeNormalizedKmer encodes a single k-mer sequence to its canonical form (uint64).
+// EncodeCanonicalKmer encodes a single k-mer sequence to its canonical form (uint64).
 // Returns the lexicographically smaller of the k-mer and its reverse complement.
-// This is the optimal zero-allocation function for encoding a single normalized k-mer.
+// This is the optimal zero-allocation function for encoding a single canonical k-mer.
 //
 // Parameters:
 //   - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
 //   - k: k-mer size (must be between 1 and 31)
 //
 // Returns:
-//   - normalized k-mer as uint64
+//   - canonical k-mer as uint64
 //   - panics if len(seq) != k or k is invalid
 //
 // Example:
 //
-//	canonical := EncodeNormalizedKmer([]byte("ACGT"), 4)
+//	canonical := EncodeCanonicalKmer([]byte("ACGT"), 4)
-func EncodeNormalizedKmer(seq []byte, k int) uint64 {
+func EncodeCanonicalKmer(seq []byte, k int) uint64 {
 	if k < 1 || k > 31 {
 		panic("k must be between 1 and 31")
 	}
@@ -265,7 +265,7 @@ func IterKmers(seq []byte, k int) iter.Seq[uint64] {
 	}
 }
-// IterNormalizedKmersWithErrors returns an iterator over all normalized k-mers
+// IterCanonicalKmersWithErrors returns an iterator over all canonical k-mers
 // with error markers for ambiguous bases. No intermediate slice is allocated.
 //
 // Ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V) are encoded as 0xFF and detected
@@ -279,16 +279,16 @@ func IterKmers(seq []byte, k int) iter.Seq[uint64] {
 //   - k: k-mer size (must be odd, between 1 and 31)
 //
 // Returns:
-//   - iterator yielding uint64 normalized k-mers with error markers
+//   - iterator yielding uint64 canonical k-mers with error markers
 //
 // Example:
 //
-//	for kmer := range IterNormalizedKmersWithErrors(seq, 21) {
+//	for kmer := range IterCanonicalKmersWithErrors(seq, 21) {
 //	    if GetKmerError(kmer) == 0 {
 //	        bitmap.Add(kmer) // Only add clean k-mers
 //	    }
 //	}
-func IterNormalizedKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
+func IterCanonicalKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
 	return func(yield func(uint64) bool) {
 		if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
 			return
@@ -380,7 +380,7 @@ func IterNormalizedKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
 	}
 }
-// IterNormalizedKmers returns an iterator over all normalized (canonical) k-mers.
+// IterCanonicalKmers returns an iterator over all canonical k-mers.
 // No intermediate slice is allocated, making it memory-efficient.
 //
 // Parameters:
@@ -388,14 +388,14 @@ func IterNormalizedKmersWithErrors(seq []byte, k int) iter.Seq[uint64] {
 //   - k: k-mer size (must be between 1 and 31)
 //
 // Returns:
-//   - iterator yielding uint64 normalized k-mers
+//   - iterator yielding uint64 canonical k-mers
 //
 // Example:
 //
-//	for canonical := range IterNormalizedKmers(seq, 21) {
+//	for canonical := range IterCanonicalKmers(seq, 21) {
 //	    bitmap.Add(canonical)
 //	}
-func IterNormalizedKmers(seq []byte, k int) iter.Seq[uint64] {
+func IterCanonicalKmers(seq []byte, k int) iter.Seq[uint64] {
 	return func(yield func(uint64) bool) {
 		if k < 1 || k > 31 || len(seq) < k {
 			return
@@ -615,19 +615,19 @@ func ReverseComplement(kmer uint64, k int) uint64 {
 	return rc
 }
-// NormalizeKmer returns the lexicographically smaller of a k-mer and its
+// CanonicalKmer returns the lexicographically smaller of a k-mer and its
 // reverse complement. This canonical form ensures that a k-mer and its
 // reverse complement map to the same value.
 //
-// This implements REVERSE COMPLEMENT normalization (biological canonicalization).
+// This implements REVERSE COMPLEMENT canonicalization (biological canonical form).
 //
 // Parameters:
 //   - kmer: the encoded k-mer
 //   - k: the k-mer size (number of nucleotides)
 //
 // Returns:
-//   - the canonical (normalized) k-mer
+//   - the canonical k-mer
-func NormalizeKmer(kmer uint64, k int) uint64 {
+func CanonicalKmer(kmer uint64, k int) uint64 {
 	rc := ReverseComplement(kmer, k)
 	if rc < kmer {
 		return rc
@@ -674,26 +674,26 @@ func NormalizeCircular(kmer uint64, k int) uint64 {
 	return canonical
 }
-// EncodeCircularNormalizedKmer encodes a k-mer and returns its lexicographically
+// EncodeCircularCanonicalKmer encodes a k-mer and returns its lexicographically
 // smallest circular rotation. This is optimized for single k-mer encoding with
-// circular normalization.
+// circular canonicalization.
 //
-// This implements CIRCULAR PERMUTATION normalization, used for entropy-based
+// This implements CIRCULAR PERMUTATION canonicalization, used for entropy-based
-// low-complexity masking. This is DIFFERENT from EncodeNormalizedKmer which
+// low-complexity masking. This is DIFFERENT from EncodeCanonicalKmer which
-// uses reverse complement normalization.
+// uses reverse complement canonicalization.
 //
 // Parameters:
 //   - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
 //   - k: k-mer size (must be between 1 and 31)
 //
 // Returns:
-//   - normalized k-mer as uint64 (smallest circular rotation)
+//   - canonical k-mer as uint64 (smallest circular rotation)
 //   - panics if len(seq) != k or k is invalid
 //
 // Example:
 //
-//	canonical := EncodeCircularNormalizedKmer([]byte("ACGT"), 4)
+//	canonical := EncodeCircularCanonicalKmer([]byte("ACGT"), 4)
-func EncodeCircularNormalizedKmer(seq []byte, k int) uint64 {
+func EncodeCircularCanonicalKmer(seq []byte, k int) uint64 {
 	kmer := EncodeKmer(seq, k)
 	return NormalizeCircular(kmer, k)
 }
@@ -827,7 +827,7 @@ func necklaceCount(n, alphabetSize int) int {
 	return sum / n
 }
-// EncodeNormalizedKmersWithErrors converts a DNA sequence to a slice of normalized k-mers
+// EncodeCanonicalKmersWithErrors converts a DNA sequence to a slice of canonical k-mers
 // with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V).
 //
 // Ambiguous bases are encoded as 0xFF by __single_base_code__ and detected during
@@ -846,9 +846,9 @@ func necklaceCount(n, alphabetSize int) int {
 //   - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
 //
 // Returns:
-//   - slice of uint64 normalized k-mers with error markers
+//   - slice of uint64 canonical k-mers with error markers
 //   - nil if sequence is shorter than k, k is invalid, or k is even
-func EncodeNormalizedKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint64 {
+func EncodeCanonicalKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint64 {
 	if k < 1 || k > 31 || k%2 == 0 || len(seq) < k {
 		return nil
 	}
@@ -860,14 +860,14 @@ func EncodeNormalizedKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint
 		result = (*buffer)[:0]
 	}
-	for kmer := range IterNormalizedKmersWithErrors(seq, k) {
+	for kmer := range IterCanonicalKmersWithErrors(seq, k) {
 		result = append(result, kmer)
 	}
 	return result
 }
-// EncodeNormalizedKmers converts a DNA sequence to a slice of normalized k-mers.
+// EncodeCanonicalKmers converts a DNA sequence to a slice of canonical k-mers.
 // Each k-mer is replaced by the lexicographically smaller of itself and its
 // reverse complement. This ensures that forward and reverse complement sequences
 // produce the same k-mer set.
@@ -881,9 +881,9 @@ func EncodeNormalizedKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint
 //   - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
 //
 // Returns:
-//   - slice of uint64 normalized k-mers
+//   - slice of uint64 canonical k-mers
 //   - nil if sequence is shorter than k or k is invalid
-func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
+func EncodeCanonicalKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
 	if k < 1 || k > 31 || len(seq) < k {
 		return nil
 	}
@@ -895,7 +895,7 @@ func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
 		result = (*buffer)[:0]
 	}
-	for kmer := range IterNormalizedKmers(seq, k) {
+	for kmer := range IterCanonicalKmers(seq, k) {
 		result = append(result, kmer)
 	}
@@ -26,7 +26,7 @@ func NewFrequencyFilter(k, minFreq int) *FrequencyFilter {
 // Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire
 func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) {
 	rawSeq := seq.Sequence()
-	for canonical := range IterNormalizedKmers(rawSeq, ff.K()) {
+	for canonical := range IterCanonicalKmers(rawSeq, ff.K()) {
 		ff.AddKmerCode(canonical)
 	}
 }
@@ -45,9 +45,9 @@ func (ff *FrequencyFilter) AddKmerCode(kmer uint64) {
 	}
 }
-// AddNormalizedKmerCode ajoute un k-mer encodé normalisé au filtre
+// AddCanonicalKmerCode ajoute un k-mer encodé canonique au filtre
-func (ff *FrequencyFilter) AddNormalizedKmerCode(kmer uint64) {
+func (ff *FrequencyFilter) AddCanonicalKmerCode(kmer uint64) {
-	canonical := NormalizeKmer(kmer, ff.K())
+	canonical := CanonicalKmer(kmer, ff.K())
 	ff.AddKmerCode(canonical)
 }
@@ -59,11 +59,11 @@ func (ff *FrequencyFilter) AddKmer(seq []byte) {
 	ff.AddKmerCode(kmer)
 }
-// AddNormalizedKmer ajoute un k-mer normalisé au filtre en encodant la séquence
+// AddCanonicalKmer ajoute un k-mer canonique au filtre en encodant la séquence
 // La séquence doit avoir exactement k nucléotides
 // Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
-func (ff *FrequencyFilter) AddNormalizedKmer(seq []byte) {
+func (ff *FrequencyFilter) AddCanonicalKmer(seq []byte) {
-	canonical := EncodeNormalizedKmer(seq, ff.K())
+	canonical := EncodeCanonicalKmer(seq, ff.K())
 	ff.AddKmerCode(canonical)
 }
@@ -183,14 +183,14 @@ func (ff *FrequencyFilter) Load(path string) error {
 // Contains vérifie si un k-mer a atteint la fréquence minimale
 func (ff *FrequencyFilter) Contains(kmer uint64) bool {
-	canonical := NormalizeKmer(kmer, ff.K())
+	canonical := CanonicalKmer(kmer, ff.K())
 	return ff.Get(ff.MinFreq - 1).Contains(canonical)
 }
 // GetFrequency retourne la fréquence approximative d'un k-mer
 // Retourne le niveau maximum atteint (freq ≥ niveau)
 func (ff *FrequencyFilter) GetFrequency(kmer uint64) int {
-	canonical := NormalizeKmer(kmer, ff.K())
+	canonical := CanonicalKmer(kmer, ff.K())
 	freq := 0
 	for i := 0; i < ff.MinFreq; i++ {
@@ -44,9 +44,9 @@ func (ks *KmerSet) AddKmerCode(kmer uint64) {
 	ks.bitmap.Add(kmer)
 }
-// AddNormalizedKmerCode ajoute un k-mer encodé normalisé à l'ensemble
+// AddCanonicalKmerCode ajoute un k-mer encodé canonique à l'ensemble
-func (ks *KmerSet) AddNormalizedKmerCode(kmer uint64) {
+func (ks *KmerSet) AddCanonicalKmerCode(kmer uint64) {
-	canonical := NormalizeKmer(kmer, ks.k)
+	canonical := CanonicalKmer(kmer, ks.k)
 	ks.bitmap.Add(canonical)
 }
@@ -58,11 +58,11 @@ func (ks *KmerSet) AddKmer(seq []byte) {
 	ks.bitmap.Add(kmer)
 }
-// AddNormalizedKmer ajoute un k-mer normalisé à l'ensemble en encodant la séquence
+// AddCanonicalKmer ajoute un k-mer canonique à l'ensemble en encodant la séquence
 // La séquence doit avoir exactement k nucléotides
 // Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
-func (ks *KmerSet) AddNormalizedKmer(seq []byte) {
+func (ks *KmerSet) AddCanonicalKmer(seq []byte) {
-	canonical := EncodeNormalizedKmer(seq, ks.k)
+	canonical := EncodeCanonicalKmer(seq, ks.k)
 	ks.bitmap.Add(canonical)
 }
@@ -70,7 +70,7 @@ func (ks *KmerSet) AddNormalizedKmer(seq []byte) {
 // Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire
 func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
 	rawSeq := seq.Sequence()
-	for canonical := range IterNormalizedKmers(rawSeq, ks.k) {
+	for canonical := range IterCanonicalKmers(rawSeq, ks.k) {
 		ks.bitmap.Add(canonical)
 	}
 }
@@ -8,7 +8,7 @@ import (
 // corresponds to the last commit, and not the one when the file will be
 // commited
-var _Commit = "6c6c369"
+var _Commit = "16f72e6"
 var _Version = "Release 4.4.0"
 // Version returns the version of the obitools package.