From 60f27c1dc87eb2cc799e4c6f963d9af9b6b8c70e Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 4 Feb 2026 21:44:52 +0100 Subject: [PATCH] Add error handling for ambiguous bases in k-mer encoding This commit introduces error handling for ambiguous DNA bases (N, R, Y, W, S, K, M, B, D, H, V) in k-mer encoding. It adds new functions IterNormalizedKmersWithErrors and EncodeNormalizedKmersWithErrors that track and encode the number of ambiguous bases in each k-mer using error markers in the top 2 bits. The commit also updates the version string to reflect the latest changes. --- pkg/obikmer/encodefourmer.go | 4 + pkg/obikmer/encodekmer.go | 285 +++++++++++++++++++++++++++++++++++ pkg/obioptions/version.go | 2 +- 3 files changed, 290 insertions(+), 1 deletion(-) diff --git a/pkg/obikmer/encodefourmer.go b/pkg/obikmer/encodefourmer.go index 42d9326..c5adbd7 100644 --- a/pkg/obikmer/encodefourmer.go +++ b/pkg/obikmer/encodefourmer.go @@ -5,6 +5,10 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" ) +// __single_base_code__ encodes DNA bases to 2-bit values. +// Standard bases: A=0, C=1, G=2, T/U=3 +// Ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V): 0xFF (255) to signal error + var __single_base_code__ = []byte{0, // A, B, C, D, 0, 0, 1, 0, diff --git a/pkg/obikmer/encodekmer.go b/pkg/obikmer/encodekmer.go index 765b691..fa03f43 100644 --- a/pkg/obikmer/encodekmer.go +++ b/pkg/obikmer/encodekmer.go @@ -2,6 +2,27 @@ package obikmer import "iter" +var __single_base_code_err__ = []byte{0, + // A, B, C, D, + 0, 0xFF, 1, 0xFF, + // E, F, G, H, + 0xFF, 0xFF, 2, 0xFF, + // I, J, K, L, + 0xFF, 0xFF, 0xFF, 0xFF, + // M, N, O, P, + 0xFF, 0xFF, 0xFF, 0xFF, + // Q, R, S, T, + 0xFF, 0xFF, 0xFF, 3, + // U, V, W, X, + 3, 0xFF, 0xFF, 0xFF, + // Y, Z, ., ., + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, +} + +const ambiguousBaseCode = byte(0xFF) + + // Error markers for k-mers of odd length ≤ 31 // For odd k ≤ 31, only k*2 bits are used (max 62 bits), leaving 2 high bits // available for error coding in the top 2 bits (bits 62-63). @@ -153,6 +174,137 @@ func IterKmers(seq []byte, k int) iter.Seq[uint64] { } } +// IterNormalizedKmersWithErrors returns an iterator over all normalized k-mers +// with error markers for ambiguous bases. No intermediate slice is allocated. +// +// Ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V) are encoded as 0xFF and detected +// during k-mer construction. The error code in bits 62-63 indicates the number of +// ambiguous bases in each k-mer (0=clean, 1-3=error count). +// +// Only valid for odd k ≤ 31 where 2 bits remain unused for error markers. +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U, and ambiguous bases) +// - k: k-mer size (must be odd, between 1 and 31) +// +// Returns: +// - iterator yielding uint64 normalized k-mers with error markers +// +// Example: +// for kmer := range IterNormalizedKmersWithErrors(seq, 21) { +// if GetKmerError(kmer) == 0 { +// bitmap.Add(kmer) // Only add clean k-mers +// } +// } +func IterNormalizedKmersWithErrors(seq []byte, k int) iter.Seq[uint64] { + return func(yield func(uint64) bool) { + // Only valid for odd k ≤ 31 + if k < 1 || k > 31 || k%2 == 0 || len(seq) < k { + return + } + + // Mask to keep only k*2 bits + mask := uint64(1)<<(k*2) - 1 + + // Shift amount for adding to reverse complement (high position) + rcShift := uint((k - 1) * 2) + + // Track ambiguous base count in sliding window + ambiguousCount := 0 + const ambiguousCode = byte(0xFF) + + // Build the first k-mer (forward and reverse complement) + var fwd, rvc uint64 + hasError := false + for i := 0; i < k; i++ { + code := __single_base_code_err__[seq[i]&31] + + // Check for ambiguous base + if code == ambiguousCode { + ambiguousCount++ + hasError = true + code = 0 // Encode as A for the sequence bits + } + + codeUint := uint64(code) + // Forward: shift left and add new code at low end + fwd <<= 2 + fwd |= codeUint + // Reverse complement: shift right and add complement at high end + rvc >>= 2 + rvc |= (codeUint ^ 3) << rcShift + } + + // Yield normalized k-mer with error marker + var canonical uint64 + if fwd <= rvc { + canonical = fwd + } else { + canonical = rvc + } + + // Set error code based on ambiguous count + if hasError { + errorCode := uint64(ambiguousCount) + if errorCode > 3 { + errorCode = 3 + } + canonical = SetKmerError(canonical, errorCode) + } + + if !yield(canonical) { + return + } + + // Slide through the rest of the sequence + for i := k; i < len(seq); i++ { + // Check outgoing base (position i-k) + outgoingCode := __single_base_code__[seq[i-k]&31] + if outgoingCode == ambiguousCode { + ambiguousCount-- + } + + // Check incoming base (position i) + code := __single_base_code__[seq[i]&31] + if code == ambiguousCode { + ambiguousCount++ + code = 0 // Encode as A for the sequence bits + } + + codeUint := uint64(code) + + // Update forward k-mer: shift left, add new code, mask + fwd <<= 2 + fwd |= codeUint + fwd &= mask + + // Update reverse complement: shift right, add complement at high end + rvc >>= 2 + rvc |= (codeUint ^ 3) << rcShift + + // Yield normalized k-mer + if fwd <= rvc { + canonical = fwd + } else { + canonical = rvc + } + + // Set error code based on ambiguous count + if ambiguousCount > 0 { + errorCode := uint64(ambiguousCount) + if errorCode > 3 { + errorCode = 3 + } + canonical = SetKmerError(canonical, errorCode) + } + + if !yield(canonical) { + return + } + } + } +} + // IterNormalizedKmers returns an iterator over all normalized (canonical) k-mers. // No intermediate slice is allocated, making it memory-efficient. // @@ -447,6 +599,139 @@ func NormalizeKmer(kmer uint64, k int) uint64 { return kmer } +// EncodeNormalizedKmersWithErrors converts a DNA sequence to a slice of normalized k-mers +// with error markers for ambiguous bases (N, R, Y, W, S, K, M, B, D, H, V). +// +// Ambiguous bases are encoded as 0xFF by __single_base_code__ and detected during +// k-mer construction. The error code in bits 62-63 indicates the number of ambiguous +// bases in each k-mer: +// - errorCode 0: no ambiguous bases (clean k-mer) +// - errorCode 1: 1 ambiguous base +// - errorCode 2: 2 ambiguous bases +// - errorCode 3: 3 or more ambiguous bases +// +// Only valid for odd k ≤ 31 where 2 bits remain unused for error markers. +// +// Parameters: +// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U, and ambiguous bases) +// - k: k-mer size (must be odd, between 1 and 31) +// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. +// +// Returns: +// - slice of uint64 normalized k-mers with error markers +// - nil if sequence is shorter than k, k is invalid, or k is even +func EncodeNormalizedKmersWithErrors(seq []byte, k int, buffer *[]uint64) []uint64 { + // Only valid for odd k ≤ 31 + if k < 1 || k > 31 || k%2 == 0 || len(seq) < k { + return nil + } + + n := len(seq) - k + 1 + + var result []uint64 + if buffer == nil { + result = make([]uint64, 0, n) + } else { + result = (*buffer)[:0] + } + + // Mask to keep only k*2 bits + mask := uint64(1)<<(k*2) - 1 + + // Shift amount for adding to reverse complement (high position) + rcShift := uint((k - 1) * 2) + + // Track ambiguous base count in sliding window + ambiguousCount := 0 + const ambiguousCode = byte(0xFF) + + // Build the first k-mer (forward and reverse complement) + var fwd, rvc uint64 + hasError := false + for i := 0; i < k; i++ { + code := __single_base_code_err__[seq[i]&31] + + // Check for ambiguous base + if code == ambiguousCode { + ambiguousCount++ + hasError = true + code = 0 // Encode as A for the sequence bits + } + + codeUint := uint64(code) + // Forward: shift left and add new code at low end + fwd <<= 2 + fwd |= codeUint + // Reverse complement: shift right and add complement at high end + rvc >>= 2 + rvc |= (codeUint ^ 3) << rcShift + } + + // Store the normalized (canonical) k-mer with error marker + var canonical uint64 + if fwd <= rvc { + canonical = fwd + } else { + canonical = rvc + } + + // Set error code based on ambiguous count + if hasError { + errorCode := uint64(ambiguousCount) + if errorCode > 3 { + errorCode = 3 + } + canonical = SetKmerError(canonical, errorCode) + } + result = append(result, canonical) + + // Slide through the rest of the sequence + for i := k; i < len(seq); i++ { + // Check outgoing base (position i-k) + outgoingCode := __single_base_code__[seq[i-k]&31] + if outgoingCode == ambiguousCode { + ambiguousCount-- + } + + // Check incoming base (position i) + code := __single_base_code__[seq[i]&31] + if code == ambiguousCode { + ambiguousCount++ + code = 0 // Encode as A for the sequence bits + } + + codeUint := uint64(code) + + // Update forward k-mer: shift left, add new code, mask + fwd <<= 2 + fwd |= codeUint + fwd &= mask + + // Update reverse complement: shift right, add complement at high end + rvc >>= 2 + rvc |= (codeUint ^ 3) << rcShift + + // Store the normalized k-mer + if fwd <= rvc { + canonical = fwd + } else { + canonical = rvc + } + + // Set error code based on ambiguous count + if ambiguousCount > 0 { + errorCode := uint64(ambiguousCount) + if errorCode > 3 { + errorCode = 3 + } + canonical = SetKmerError(canonical, errorCode) + } + result = append(result, canonical) + } + + return result +} + // EncodeNormalizedKmers converts a DNA sequence to a slice of normalized k-mers. // Each k-mer is replaced by the lexicographically smaller of itself and its // reverse complement. This ensures that forward and reverse complement sequences diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 7e2201e..d8e3a4b 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "52244cd" +var _Commit = "28162ac" var _Version = "Release 4.4.0" // Version returns the version of the obitools package.