mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 05:20:52 +00:00
Add k-mer encoding and decoding functions with normalized k-mer support
This commit introduces new functions for encoding and decoding k-mers, including support for normalized k-mers. It also updates the frequency filter and k-mer set implementations to use the new encoding functions, providing zero-allocation encoding for better performance. The commit hash has been updated to reflect the latest changes.
This commit is contained in:
@@ -69,6 +69,116 @@ func ClearKmerError(kmer uint64) uint64 {
|
||||
return kmer & KmerSequenceMask
|
||||
}
|
||||
|
||||
// EncodeKmer encodes a single k-mer sequence to uint64.
|
||||
// This is the optimal zero-allocation function for encoding a single k-mer.
|
||||
//
|
||||
// Each nucleotide is encoded on 2 bits according to __single_base_code__:
|
||||
// - A = 0 (00)
|
||||
// - C = 1 (01)
|
||||
// - G = 2 (10)
|
||||
// - T/U = 3 (11)
|
||||
//
|
||||
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
|
||||
// available for error markers if needed.
|
||||
//
|
||||
// Parameters:
|
||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||
// - k: k-mer size (must be between 1 and 31)
|
||||
//
|
||||
// Returns:
|
||||
// - encoded k-mer as uint64
|
||||
// - panics if len(seq) != k or k is invalid
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// kmer := EncodeKmer([]byte("ACGT"), 4)
|
||||
func EncodeKmer(seq []byte, k int) uint64 {
|
||||
if k < 1 || k > 31 {
|
||||
panic("k must be between 1 and 31")
|
||||
}
|
||||
if len(seq) != k {
|
||||
panic("sequence length must equal k")
|
||||
}
|
||||
|
||||
var kmer uint64
|
||||
for i := 0; i < k; i++ {
|
||||
kmer <<= 2
|
||||
kmer |= uint64(__single_base_code__[seq[i]&31])
|
||||
}
|
||||
return kmer
|
||||
}
|
||||
|
||||
// EncodeNormalizedKmer encodes a single k-mer sequence to its canonical form (uint64).
|
||||
// Returns the lexicographically smaller of the k-mer and its reverse complement.
|
||||
// This is the optimal zero-allocation function for encoding a single normalized k-mer.
|
||||
//
|
||||
// Parameters:
|
||||
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
|
||||
// - k: k-mer size (must be between 1 and 31)
|
||||
//
|
||||
// Returns:
|
||||
// - normalized k-mer as uint64
|
||||
// - panics if len(seq) != k or k is invalid
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// canonical := EncodeNormalizedKmer([]byte("ACGT"), 4)
|
||||
func EncodeNormalizedKmer(seq []byte, k int) uint64 {
|
||||
if k < 1 || k > 31 {
|
||||
panic("k must be between 1 and 31")
|
||||
}
|
||||
if len(seq) != k {
|
||||
panic("sequence length must equal k")
|
||||
}
|
||||
|
||||
rcShift := uint((k - 1) * 2)
|
||||
|
||||
var fwd, rvc uint64
|
||||
for i := 0; i < k; i++ {
|
||||
code := uint64(__single_base_code__[seq[i]&31])
|
||||
fwd <<= 2
|
||||
fwd |= code
|
||||
rvc >>= 2
|
||||
rvc |= (code ^ 3) << rcShift
|
||||
}
|
||||
|
||||
if fwd <= rvc {
|
||||
return fwd
|
||||
}
|
||||
return rvc
|
||||
}
|
||||
|
||||
// DecodeKmer decodes a uint64 k-mer back to a DNA sequence.
|
||||
// This function reuses a provided buffer to avoid allocation.
|
||||
//
|
||||
// Parameters:
|
||||
// - kmer: encoded k-mer as uint64
|
||||
// - k: k-mer size (number of nucleotides)
|
||||
// - buffer: pre-allocated buffer of length >= k (if nil, allocates new slice)
|
||||
//
|
||||
// Returns:
|
||||
// - decoded DNA sequence as []byte (lowercase acgt)
|
||||
//
|
||||
// Example:
|
||||
//
|
||||
// var buf [32]byte
|
||||
// seq := DecodeKmer(kmer, 21, buf[:])
|
||||
func DecodeKmer(kmer uint64, k int, buffer []byte) []byte {
|
||||
var result []byte
|
||||
if buffer == nil || len(buffer) < k {
|
||||
result = make([]byte, k)
|
||||
} else {
|
||||
result = buffer[:k]
|
||||
}
|
||||
|
||||
bases := [4]byte{'a', 'c', 'g', 't'}
|
||||
for i := k - 1; i >= 0; i-- {
|
||||
result[i] = bases[kmer&3]
|
||||
kmer >>= 2
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// EncodeKmers converts a DNA sequence to a slice of encoded k-mers.
|
||||
// Each nucleotide is encoded on 2 bits according to __single_base_code__:
|
||||
// - A = 0 (00)
|
||||
|
||||
@@ -27,12 +27,12 @@ func NewFrequencyFilter(k, minFreq int) *FrequencyFilter {
|
||||
func (ff *FrequencyFilter) AddSequence(seq *obiseq.BioSequence) {
|
||||
rawSeq := seq.Sequence()
|
||||
for canonical := range IterNormalizedKmers(rawSeq, ff.K()) {
|
||||
ff.addKmer(canonical)
|
||||
ff.AddKmerCode(canonical)
|
||||
}
|
||||
}
|
||||
|
||||
// addKmer ajoute un k-mer au filtre (algorithme principal)
|
||||
func (ff *FrequencyFilter) addKmer(kmer uint64) {
|
||||
// AddKmerCode ajoute un k-mer encodé au filtre (algorithme principal)
|
||||
func (ff *FrequencyFilter) AddKmerCode(kmer uint64) {
|
||||
// Trouver le niveau actuel du k-mer
|
||||
c := 0
|
||||
for c < ff.MinFreq && ff.Get(c).Contains(kmer) {
|
||||
@@ -41,10 +41,32 @@ func (ff *FrequencyFilter) addKmer(kmer uint64) {
|
||||
|
||||
// Ajouter au niveau suivant (si pas encore au maximum)
|
||||
if c < ff.MinFreq {
|
||||
ff.Get(c).Add(kmer)
|
||||
ff.Get(c).AddKmerCode(kmer)
|
||||
}
|
||||
}
|
||||
|
||||
// AddNormalizedKmerCode ajoute un k-mer encodé normalisé au filtre
|
||||
func (ff *FrequencyFilter) AddNormalizedKmerCode(kmer uint64) {
|
||||
canonical := NormalizeKmer(kmer, ff.K())
|
||||
ff.AddKmerCode(canonical)
|
||||
}
|
||||
|
||||
// AddKmer ajoute un k-mer au filtre en encodant la séquence
|
||||
// La séquence doit avoir exactement k nucléotides
|
||||
// Zero-allocation: encode directement sans créer de slice intermédiaire
|
||||
func (ff *FrequencyFilter) AddKmer(seq []byte) {
|
||||
kmer := EncodeKmer(seq, ff.K())
|
||||
ff.AddKmerCode(kmer)
|
||||
}
|
||||
|
||||
// AddNormalizedKmer ajoute un k-mer normalisé au filtre en encodant la séquence
|
||||
// La séquence doit avoir exactement k nucléotides
|
||||
// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
|
||||
func (ff *FrequencyFilter) AddNormalizedKmer(seq []byte) {
|
||||
canonical := EncodeNormalizedKmer(seq, ff.K())
|
||||
ff.AddKmerCode(canonical)
|
||||
}
|
||||
|
||||
// GetFilteredSet retourne un KmerSet des k-mers avec fréquence ≥ minFreq
|
||||
func (ff *FrequencyFilter) GetFilteredSet() *KmerSet {
|
||||
// Les k-mers filtrés sont dans le dernier niveau
|
||||
|
||||
@@ -39,11 +39,33 @@ func (ks *KmerSet) K() int {
|
||||
return ks.k
|
||||
}
|
||||
|
||||
// Add ajoute un k-mer à l'ensemble
|
||||
func (ks *KmerSet) Add(kmer uint64) {
|
||||
// AddKmerCode ajoute un k-mer encodé à l'ensemble
|
||||
func (ks *KmerSet) AddKmerCode(kmer uint64) {
|
||||
ks.bitmap.Add(kmer)
|
||||
}
|
||||
|
||||
// AddNormalizedKmerCode ajoute un k-mer encodé normalisé à l'ensemble
|
||||
func (ks *KmerSet) AddNormalizedKmerCode(kmer uint64) {
|
||||
canonical := NormalizeKmer(kmer, ks.k)
|
||||
ks.bitmap.Add(canonical)
|
||||
}
|
||||
|
||||
// AddKmer ajoute un k-mer à l'ensemble en encodant la séquence
|
||||
// La séquence doit avoir exactement k nucléotides
|
||||
// Zero-allocation: encode directement sans créer de slice intermédiaire
|
||||
func (ks *KmerSet) AddKmer(seq []byte) {
|
||||
kmer := EncodeKmer(seq, ks.k)
|
||||
ks.bitmap.Add(kmer)
|
||||
}
|
||||
|
||||
// AddNormalizedKmer ajoute un k-mer normalisé à l'ensemble en encodant la séquence
|
||||
// La séquence doit avoir exactement k nucléotides
|
||||
// Zero-allocation: encode directement en forme canonique sans créer de slice intermédiaire
|
||||
func (ks *KmerSet) AddNormalizedKmer(seq []byte) {
|
||||
canonical := EncodeNormalizedKmer(seq, ks.k)
|
||||
ks.bitmap.Add(canonical)
|
||||
}
|
||||
|
||||
// AddSequence ajoute tous les k-mers d'une séquence à l'ensemble
|
||||
// Utilise un itérateur pour éviter l'allocation d'un vecteur intermédiaire
|
||||
func (ks *KmerSet) AddSequence(seq *obiseq.BioSequence) {
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// commited
|
||||
|
||||
var _Commit = "afcb43b"
|
||||
var _Commit = "c5dd477"
|
||||
var _Version = "Release 4.4.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
||||
Reference in New Issue
Block a user