From 6c6c369ee26a9d47dfbe11c62fe14d1800f91b4b Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 5 Feb 2026 15:51:44 +0100 Subject: [PATCH] Add k-mer encoding and decoding functions with normalized k-mer support This commit introduces new functions for encoding and decoding k-mers, including support for normalized k-mers. It also updates the frequency filter and k-mer set implementations to use the new encoding functions, providing zero-allocation encoding for better performance. The commit hash has been updated to reflect the latest changes. --- pkg/obikmer/encodekmer.go | 110 ++++++++++++++++++++++++++++++++ pkg/obikmer/frequency_filter.go | 30 +++++++-- pkg/obikmer/kmer_set.go | 26 +++++++- pkg/obioptions/version.go | 2 +- 4 files changed, 161 insertions(+), 7 deletions(-) diff --git a/pkg/obikmer/encodekmer.go b/pkg/obikmer/encodekmer.go index 4cfa587..756e882 100644 --- a/pkg/obikmer/encodekmer.go +++ b/pkg/obikmer/encodekmer.go @@ -69,6 +69,116 @@ func ClearKmerError(kmer uint64) uint64 { return kmer & KmerSequenceMask } +// EncodeKmer encodes a single k-mer sequence to uint64. +// This is the optimal zero-allocation function for encoding a single k-mer. +// +// Each nucleotide is encoded on 2 bits according to __single_base_code__: +// - A = 0 (00) +// - C = 1 (01) +// - G = 2 (10) +// - T/U = 3 (11) +// +// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits +// available for error markers if needed. +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) +// - k: k-mer size (must be between 1 and 31) +// +// Returns: +// - encoded k-mer as uint64 +// - panics if len(seq) != k or k is invalid +// +// Example: +// +// kmer := EncodeKmer([]byte("ACGT"), 4) +func EncodeKmer(seq []byte, k int) uint64 { + if k < 1 || k > 31 { + panic("k must be between 1 and 31") + } + if len(seq) != k { + panic("sequence length must equal k") + } + + var kmer uint64 + for i := 0; i < k; i++ { + kmer <<= 2 + kmer |= uint64(__single_base_code__[seq[i]&31]) + } + return kmer +} + +// EncodeNormalizedKmer encodes a single k-mer sequence to its canonical form (uint64). +// Returns the lexicographically smaller of the k-mer and its reverse complement. +// This is the optimal zero-allocation function for encoding a single normalized k-mer. +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) +// - k: k-mer size (must be between 1 and 31) +// +// Returns: +// - normalized k-mer as uint64 +// - panics if len(seq) != k or k is invalid +// +// Example: +// +// canonical := EncodeNormalizedKmer([]byte("ACGT"), 4) +func EncodeNormalizedKmer(seq []byte, k int) uint64 { + if k < 1 || k > 31 { + panic("k must be between 1 and 31") + } + if len(seq) != k { + panic("sequence length must equal k") + } + + rcShift := uint((k - 1) * 2) + + var fwd, rvc uint64 + for i := 0; i < k; i++ { + code := uint64(__single_base_code__[seq[i]&31]) + fwd <<= 2 + fwd |= code + rvc >>= 2 + rvc |= (code ^ 3) << rcShift + } + + if fwd <= rvc { + return fwd + } + return rvc +} + +// DecodeKmer decodes a uint64 k-mer back to a DNA sequence. +// This function reuses a provided buffer to avoid allocation. +// +// Parameters: +// - kmer: encoded k-mer as uint64 +// - k: k-mer size (number of nucleotides) +// - buffer: pre-allocated buffer of length >= k (if nil, allocates new slice) +// +// Returns: +// - decoded DNA sequence as []byte (lowercase acgt) +// +// Example: +// +// var buf [32]byte +// seq := DecodeKmer(kmer, 21, buf[:]) +func DecodeKmer(kmer uint64, k int, buffer []byte) []byte { + var result []byte + if buffer == nil || len(buffer) < k { + result = make([]byte, k) + } else { + result = buffer[:k] + } + + bases := [4]byte{'a', 'c', 'g', 't'} + for i := k - 1; i >= 0; i-- { + result[i] = bases[kmer&3] + kmer >>= 2 + } + return result +} + // EncodeKmers converts a DNA sequence to a slice of encoded k-mers. // Each nucleotide is encoded on 2 bits according to __single_base_code__: // - A = 0 (00) diff --git a/pkg/obikmer/frequency_filter.go b/pkg/obikmer/frequency_filter.go index 3bc7ddf..83ba616 100644 --- a/pkg/obikmer/frequency_filter.go +++ b/pkg/obikmer/frequency_filter.go @@ -27,12 +27,12 @@ func NewFrequencyFilter(k, minFreq int) *FrequencyFilter { func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) { rawSeq := seq.Sequence() for canonical := range IterNormalizedKmers(rawSeq, ff.K()) { - ff.addKmer(canonical) + ff.AddKmerCode(canonical) } } -// addKmer ajoute un k-mer au filtre (algorithme principal) -func (ff *FrequencyFilter) addKmer(kmer uint64) { +// AddKmerCode ajoute un k-mer encodé au filtre (algorithme principal) +func (ff *FrequencyFilter) AddKmerCode(kmer uint64) { // Trouver le niveau actuel du k-mer c := 0 for c < ff.MinFreq && ff.Get(c).Contains(kmer) { @@ -41,10 +41,32 @@ func (ff *FrequencyFilter) addKmer(kmer uint64) { // Ajouter au niveau suivant (si pas encore au maximum) if c < ff.MinFreq { - ff.Get(c).Add(kmer) + ff.Get(c).AddKmerCode(kmer) } } +// AddNormalizedKmerCode ajoute un k-mer encodé normalisé au filtre +func (ff *FrequencyFilter) AddNormalizedKmerCode(kmer uint64) { + canonical := NormalizeKmer(kmer, ff.K()) + ff.AddKmerCode(canonical) +} + +// AddKmer ajoute un k-mer au filtre en encodant la séquence +// La séquence doit avoir exactement k nucléotides +// Zero-allocation: encode directement sans créer de slice intermédiaire +func (ff *FrequencyFilter) AddKmer(seq []byte) { + kmer := EncodeKmer(seq, ff.K()) + ff.AddKmerCode(kmer) +} + +// AddNormalizedKmer ajoute un k-mer normalisé au filtre en encodant la séquence +// La séquence doit avoir exactement k nucléotides +// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire +func (ff *FrequencyFilter) AddNormalizedKmer(seq []byte) { + canonical := EncodeNormalizedKmer(seq, ff.K()) + ff.AddKmerCode(canonical) +} + // GetFilteredSet retourne un KmerSet des k-mers avec fréquence ≥ minFreq func (ff *FrequencyFilter) GetFilteredSet() *KmerSet { // Les k-mers filtrés sont dans le dernier niveau diff --git a/pkg/obikmer/kmer_set.go b/pkg/obikmer/kmer_set.go index 49bebe7..5832068 100644 --- a/pkg/obikmer/kmer_set.go +++ b/pkg/obikmer/kmer_set.go @@ -39,11 +39,33 @@ func (ks *KmerSet) K() int { return ks.k } -// Add ajoute un k-mer à l'ensemble -func (ks *KmerSet) Add(kmer uint64) { +// AddKmerCode ajoute un k-mer encodé à l'ensemble +func (ks *KmerSet) AddKmerCode(kmer uint64) { ks.bitmap.Add(kmer) } +// AddNormalizedKmerCode ajoute un k-mer encodé normalisé à l'ensemble +func (ks *KmerSet) AddNormalizedKmerCode(kmer uint64) { + canonical := NormalizeKmer(kmer, ks.k) + ks.bitmap.Add(canonical) +} + +// AddKmer ajoute un k-mer à l'ensemble en encodant la séquence +// La séquence doit avoir exactement k nucléotides +// Zero-allocation: encode directement sans créer de slice intermédiaire +func (ks *KmerSet) AddKmer(seq []byte) { + kmer := EncodeKmer(seq, ks.k) + ks.bitmap.Add(kmer) +} + +// AddNormalizedKmer ajoute un k-mer normalisé à l'ensemble en encodant la séquence +// La séquence doit avoir exactement k nucléotides +// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire +func (ks *KmerSet) AddNormalizedKmer(seq []byte) { + canonical := EncodeNormalizedKmer(seq, ks.k) + ks.bitmap.Add(canonical) +} + // AddSequence ajoute tous les k-mers d'une séquence à l'ensemble // Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) { diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 8bacc19..3084368 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "afcb43b" +var _Commit = "c5dd477" var _Version = "Release 4.4.0" // Version returns the version of the obitools package.