Add error marker support for k-mers with enhanced documentation

This commit introduces error marker functionality for k-mers with odd lengths up to 31. The top 2 bits of each k-mer are now reserved for error coding (0-3), allowing for error detection and correction capabilities. Key changes include:

- Added constants KmerErrorMask and KmerSequenceMask for bit manipulation
- Implemented SetKmerError, GetKmerError, and ClearKmerError functions
- Updated EncodeKmers, ExtractSuperKmers, EncodeNormalizedKmers functions to enforce k ≤ 31
- Enhanced ReverseComplement to preserve error bits during reverse complement operations
- Added comprehensive tests for error marker functionality including edge cases and integration tests

The maximum k-mer size is now capped at 31 to accommodate the error bits, ensuring that k-mers with odd lengths ≤ 31 utilize only 62 bits of the 64-bit uint64, leaving the top 2 bits available for error coding.
This commit is contained in:
Eric Coissac
2026-02-04 16:13:13 +01:00
parent 05de9ca58e
commit 1a1adb83ac
2 changed files with 354 additions and 23 deletions

View File

@@ -1,5 +1,51 @@
package obikmer package obikmer
// Error markers for k-mers of odd length ≤ 31
// For odd k ≤ 31, only k*2 bits are used (max 62 bits), leaving 2 high bits
// available for error coding in the top 2 bits (bits 62-63).
//
// Error codes are simple integers:
// 0 = no error
// 1 = error type 1
// 2 = error type 2
// 3 = error type 3
//
// Use SetKmerError(kmer, code) and GetKmerError(kmer) to manipulate error bits.
const (
KmerErrorMask uint64 = 0b11 << 62 // Mask to extract error bits (bits 62-63)
KmerSequenceMask uint64 = ^KmerErrorMask // Mask to extract sequence bits (bits 0-61)
)
// SetKmerError sets the error marker bits on a k-mer encoded value.
// Only valid for odd k-mer sizes ≤ 31 where 2 bits remain unused.
//
// Parameters:
// - kmer: the encoded k-mer value
// - errorCode: error code (0-3), where 0=no error, 1-3=error types
//
// Returns:
// - k-mer with error bits set
func SetKmerError(kmer uint64, errorCode uint64) uint64 {
return (kmer & KmerSequenceMask) | ((errorCode & 0b11) << 62)
}
// GetKmerError extracts the error marker bits from a k-mer encoded value.
//
// Returns:
// - error code (0-3) as raw value (not shifted)
func GetKmerError(kmer uint64) uint64 {
return (kmer & KmerErrorMask) >> 62
}
// ClearKmerError removes the error marker bits from a k-mer, returning
// just the sequence encoding.
//
// Returns:
// - k-mer with error bits cleared (set to 00)
func ClearKmerError(kmer uint64) uint64 {
return kmer & KmerSequenceMask
}
// EncodeKmers converts a DNA sequence to a slice of encoded k-mers. // EncodeKmers converts a DNA sequence to a slice of encoded k-mers.
// Each nucleotide is encoded on 2 bits according to __single_base_code__: // Each nucleotide is encoded on 2 bits according to __single_base_code__:
// - A = 0 (00) // - A = 0 (00)
@@ -10,16 +56,19 @@ package obikmer
// The function returns overlapping k-mers of size k encoded as uint64. // The function returns overlapping k-mers of size k encoded as uint64.
// For a sequence of length n, it returns n-k+1 k-mers. // For a sequence of length n, it returns n-k+1 k-mers.
// //
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
// available for error markers (see SetKmerError).
//
// Parameters: // Parameters:
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) // - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
// - k: k-mer size (must be between 1 and 32) // - k: k-mer size (must be between 1 and 31)
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. // - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
// //
// Returns: // Returns:
// - slice of uint64 encoded k-mers // - slice of uint64 encoded k-mers
// - nil if sequence is shorter than k or k is invalid // - nil if sequence is shorter than k or k is invalid
func EncodeKmers(seq []byte, k int, buffer *[]uint64) []uint64 { func EncodeKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
if k < 1 || k > 32 || len(seq) < k { if k < 1 || k > 31 || len(seq) < k {
return nil return nil
} }
@@ -80,9 +129,12 @@ type dequeItem struct {
// - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation // - Simultaneous forward/reverse m-mer encoding for O(1) canonical m-mer computation
// - Monotone deque for O(1) amortized minimizer tracking per position // - Monotone deque for O(1) amortized minimizer tracking per position
// //
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
// available for error markers if needed.
//
// Parameters: // Parameters:
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) // - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
// - k: k-mer size (must be between m+1 and 32) // - k: k-mer size (must be between m+1 and 31)
// - m: minimizer size (must be between 1 and k-1) // - m: minimizer size (must be between 1 and k-1)
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. // - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
// //
@@ -94,7 +146,7 @@ type dequeItem struct {
// Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results // Space complexity: O(k-m+1) for the deque + O(number of super k-mers) for results
func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer { func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKmer {
// Validate parameters // Validate parameters
if m < 1 || m >= k || k < 2 || k > 32 || len(seq) < k { if m < 1 || m >= k || k < 2 || k > 31 || len(seq) < k {
return nil return nil
} }
@@ -215,13 +267,19 @@ func ExtractSuperKmers(seq []byte, k int, m int, buffer *[]SuperKmer) []SuperKme
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11. // The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
// The reverse swaps the order of 2-bit pairs. // The reverse swaps the order of 2-bit pairs.
// //
// For k-mers with error markers (top 2 bits), the error bits are preserved
// and transferred to the reverse complement.
//
// Parameters: // Parameters:
// - kmer: the encoded k-mer // - kmer: the encoded k-mer (possibly with error bits in positions 62-63)
// - k: the k-mer size (number of nucleotides) // - k: the k-mer size (number of nucleotides)
// //
// Returns: // Returns:
// - the reverse complement of the k-mer // - the reverse complement of the k-mer with error bits preserved
func ReverseComplement(kmer uint64, k int) uint64 { func ReverseComplement(kmer uint64, k int) uint64 {
// Step 0: Extract and preserve error bits
errorBits := kmer & KmerErrorMask
// Step 1: Complement - XOR with all 1s to flip A↔T and C↔G // Step 1: Complement - XOR with all 1s to flip A↔T and C↔G
// For a k-mer of size k, we only want to flip the lower k*2 bits // For a k-mer of size k, we only want to flip the lower k*2 bits
mask := uint64(1)<<(k*2) - 1 mask := uint64(1)<<(k*2) - 1
@@ -238,6 +296,9 @@ func ReverseComplement(kmer uint64, k int) uint64 {
// Step 3: Shift right to align the k-mer (we reversed all 32 pairs, need only k) // Step 3: Shift right to align the k-mer (we reversed all 32 pairs, need only k)
rc >>= (64 - k*2) rc >>= (64 - k*2)
// Step 4: Restore error bits
rc |= errorBits
return rc return rc
} }
@@ -264,16 +325,19 @@ func NormalizeKmer(kmer uint64, k int) uint64 {
// reverse complement. This ensures that forward and reverse complement sequences // reverse complement. This ensures that forward and reverse complement sequences
// produce the same k-mer set. // produce the same k-mer set.
// //
// The maximum k-mer size is 31 (using 62 bits), leaving the top 2 bits
// available for error markers (see SetKmerError).
//
// Parameters: // Parameters:
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U) // - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
// - k: k-mer size (must be between 1 and 32) // - k: k-mer size (must be between 1 and 31)
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created. // - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
// //
// Returns: // Returns:
// - slice of uint64 normalized k-mers // - slice of uint64 normalized k-mers
// - nil if sequence is shorter than k or k is invalid // - nil if sequence is shorter than k or k is invalid
func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 { func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
if k < 1 || k > 32 || len(seq) < k { if k < 1 || k > 31 || len(seq) < k {
return nil return nil
} }

View File

@@ -50,10 +50,10 @@ func TestEncodeKmersBasic(t *testing.T) {
expected: []uint64{0b0001101100011011}, // ACGTACGT expected: []uint64{0b0001101100011011}, // ACGTACGT
}, },
{ {
name: "32-mer max size", name: "31-mer max size",
seq: "ACGTACGTACGTACGTACGTACGTACGTACGT", seq: "ACGTACGTACGTACGTACGTACGTACGTACG",
k: 32, k: 31,
expected: []uint64{0x1B1B1B1B1B1B1B1B}, // ACGTACGT repeated 4 times expected: []uint64{0x06C6C6C6C6C6C6C6}, // ACGTACGT repeated ~4 times
}, },
{ {
name: "longer sequence sliding", name: "longer sequence sliding",
@@ -110,10 +110,10 @@ func TestEncodeKmersEdgeCases(t *testing.T) {
t.Errorf("k=0 should return nil, got %v", result) t.Errorf("k=0 should return nil, got %v", result)
} }
// k > 32 // k > 31
result = EncodeKmers([]byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGT"), 33, nil) result = EncodeKmers([]byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGT"), 32, nil)
if result != nil { if result != nil {
t.Errorf("k>32 should return nil, got %v", result) t.Errorf("k>31 should return nil, got %v", result)
} }
// k = sequence length (single k-mer) // k = sequence length (single k-mer)
@@ -204,7 +204,7 @@ func TestEncodeKmersLongSequence(t *testing.T) {
func BenchmarkEncodeKmers(b *testing.B) { func BenchmarkEncodeKmers(b *testing.B) {
// Create test sequences of various sizes // Create test sequences of various sizes
sizes := []int{100, 1000, 10000, 100000} sizes := []int{100, 1000, 10000, 100000}
kSizes := []int{8, 16, 32} kSizes := []int{8, 16, 31}
for _, k := range kSizes { for _, k := range kSizes {
for _, size := range sizes { for _, size := range sizes {
@@ -472,7 +472,7 @@ func TestEncodeNormalizedKmersConsistency(t *testing.T) {
// BenchmarkEncodeNormalizedKmers benchmarks the normalized encoding function // BenchmarkEncodeNormalizedKmers benchmarks the normalized encoding function
func BenchmarkEncodeNormalizedKmers(b *testing.B) { func BenchmarkEncodeNormalizedKmers(b *testing.B) {
sizes := []int{100, 1000, 10000, 100000} sizes := []int{100, 1000, 10000, 100000}
kSizes := []int{8, 16, 32} kSizes := []int{8, 16, 31}
for _, k := range kSizes { for _, k := range kSizes {
for _, size := range sizes { for _, size := range sizes {
@@ -497,8 +497,8 @@ func BenchmarkEncodeNormalizedKmers(b *testing.B) {
// BenchmarkReverseComplement benchmarks the reverse complement function // BenchmarkReverseComplement benchmarks the reverse complement function
func BenchmarkReverseComplement(b *testing.B) { func BenchmarkReverseComplement(b *testing.B) {
kmer := uint64(0x123456789ABCDEF0) kmer := uint64(0x06C6C6C6C6C6C6C6)
k := 32 k := 31
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
@@ -508,8 +508,8 @@ func BenchmarkReverseComplement(b *testing.B) {
// BenchmarkNormalizeKmer benchmarks the normalization function // BenchmarkNormalizeKmer benchmarks the normalization function
func BenchmarkNormalizeKmer(b *testing.B) { func BenchmarkNormalizeKmer(b *testing.B) {
kmer := uint64(0x123456789ABCDEF0) kmer := uint64(0x06C6C6C6C6C6C6C6)
k := 32 k := 31
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
@@ -607,8 +607,8 @@ func TestExtractSuperKmersEdgeCases(t *testing.T) {
{"m >= k", "ACGTACGT", 5, 5, true}, {"m >= k", "ACGTACGT", 5, 5, true},
{"m == k-1 (valid)", "ACGTACGT", 5, 4, false}, {"m == k-1 (valid)", "ACGTACGT", 5, 4, false},
{"k < 2", "ACGTACGT", 1, 1, true}, {"k < 2", "ACGTACGT", 1, 1, true},
{"k > 32", "ACGTACGTACGTACGTACGTACGTACGTACGTACGT", 33, 16, true}, {"k > 31", "ACGTACGTACGTACGTACGTACGTACGTACGT", 32, 16, true},
{"k == 32 (valid)", "ACGTACGTACGTACGTACGTACGTACGTACGT", 32, 16, false}, {"k == 31 (valid)", "ACGTACGTACGTACGTACGTACGTACGTACG", 31, 16, false},
{"seq == k (valid)", "ACGTACGT", 8, 4, false}, {"seq == k (valid)", "ACGTACGT", 8, 4, false},
} }
@@ -789,6 +789,273 @@ func TestExtractSuperKmersVariousKM(t *testing.T) {
} }
} }
// TestKmerErrorMarkers tests the error marker functionality
func TestKmerErrorMarkers(t *testing.T) {
// Test with a 31-mer (max odd k-mer that fits in 62 bits)
kmer31 := uint64(0x1FFFFFFFFFFFFFFF) // All 62 bits set (31 * 2)
tests := []struct {
name string
kmer uint64
errorCode uint64
expected uint64
}{
{
name: "no error",
kmer: kmer31,
errorCode: 0,
expected: kmer31,
},
{
name: "error type 1",
kmer: kmer31,
errorCode: 1,
expected: kmer31 | (0b01 << 62),
},
{
name: "error type 2",
kmer: kmer31,
errorCode: 2,
expected: kmer31 | (0b10 << 62),
},
{
name: "error type 3",
kmer: kmer31,
errorCode: 3,
expected: kmer31 | (0b11 << 62),
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Set error
marked := SetKmerError(tt.kmer, tt.errorCode)
if marked != tt.expected {
t.Errorf("SetKmerError: got 0x%016X, want 0x%016X", marked, tt.expected)
}
// Get error
extractedError := GetKmerError(marked)
if extractedError != tt.errorCode {
t.Errorf("GetKmerError: got 0x%016X, want 0x%016X", extractedError, tt.errorCode)
}
// Clear error
cleared := ClearKmerError(marked)
if cleared != tt.kmer {
t.Errorf("ClearKmerError: got 0x%016X, want 0x%016X", cleared, tt.kmer)
}
// Verify sequence bits are preserved
if (marked & KmerSequenceMask) != tt.kmer {
t.Errorf("Sequence bits corrupted: got 0x%016X, want 0x%016X",
marked&KmerSequenceMask, tt.kmer)
}
})
}
}
// TestKmerErrorMarkersWithRealKmers tests error markers with actual k-mer encoding
func TestKmerErrorMarkersWithRealKmers(t *testing.T) {
// Encode a real 31-mer
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG") // 31 bases
k := 31
kmers := EncodeKmers(seq, k, nil)
if len(kmers) != 1 {
t.Fatalf("Expected 1 k-mer, got %d", len(kmers))
}
originalKmer := kmers[0]
// Test each error type
for i, errorCode := range []uint64{0, 1, 2, 3} {
t.Run("error_code_"+string(rune('0'+i)), func(t *testing.T) {
// Mark with error
marked := SetKmerError(originalKmer, errorCode)
// Verify error can be extracted
if GetKmerError(marked) != errorCode {
t.Errorf("Error code mismatch: got 0x%X, want 0x%X",
GetKmerError(marked), errorCode)
}
// Verify sequence is preserved
if ClearKmerError(marked) != originalKmer {
t.Errorf("Original k-mer not preserved after marking")
}
// Verify normalization works with error bits cleared
normalized1 := NormalizeKmer(originalKmer, k)
normalized2 := NormalizeKmer(ClearKmerError(marked), k)
if normalized1 != normalized2 {
t.Errorf("Normalization affected by error bits")
}
})
}
}
// TestKmerErrorMarkersConstants verifies the mask constant definitions
func TestKmerErrorMarkersConstants(t *testing.T) {
// Verify error mask covers exactly the top 2 bits
if KmerErrorMask != (0b11 << 62) {
t.Errorf("KmerErrorMask wrong value: 0x%016X", KmerErrorMask)
}
// Verify sequence mask is the complement
if KmerSequenceMask != ^KmerErrorMask {
t.Errorf("KmerSequenceMask should be complement of KmerErrorMask")
}
// Verify masks are mutually exclusive
if (KmerErrorMask & KmerSequenceMask) != 0 {
t.Errorf("Masks should be mutually exclusive")
}
// Verify masks cover all bits
if (KmerErrorMask | KmerSequenceMask) != ^uint64(0) {
t.Errorf("Masks should cover all 64 bits")
}
// Verify error code API
testKmer := uint64(0x06C6C6C6C6C6C6C6)
for code := uint64(0); code <= 3; code++ {
marked := SetKmerError(testKmer, code)
extracted := GetKmerError(marked)
if extracted != code {
t.Errorf("Error code %d not preserved: got %d", code, extracted)
}
}
}
// TestReverseComplementPreservesErrorBits tests that RC preserves error markers
func TestReverseComplementPreservesErrorBits(t *testing.T) {
// Test with a 31-mer
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
k := 31
kmers := EncodeKmers(seq, k, nil)
if len(kmers) != 1 {
t.Fatalf("Expected 1 k-mer, got %d", len(kmers))
}
originalKmer := kmers[0]
// Test each error code
errorCodes := []uint64{0, 1, 2, 3}
for i, errCode := range errorCodes {
t.Run("error_code_"+string(rune('0'+i)), func(t *testing.T) {
// Mark k-mer with error
marked := SetKmerError(originalKmer, errCode)
// Compute reverse complement
rc := ReverseComplement(marked, k)
// Verify error bits are preserved
extractedError := GetKmerError(rc)
if extractedError != errCode {
t.Errorf("Error bits not preserved: got 0x%X, want 0x%X", extractedError, errCode)
}
// Verify sequence was reverse complemented correctly
// (clear error bits and check RC property)
cleanOriginal := ClearKmerError(originalKmer)
cleanRC := ClearKmerError(rc)
expectedRC := ReverseComplement(cleanOriginal, k)
if cleanRC != expectedRC {
t.Errorf("Sequence not reverse complemented correctly")
}
// Verify RC(RC(x)) = x (involution property with error bits)
rcrc := ReverseComplement(rc, k)
if rcrc != marked {
t.Errorf("RC(RC(x)) != x: got 0x%016X, want 0x%016X", rcrc, marked)
}
})
}
}
// TestNormalizeKmerWithErrorBits tests that NormalizeKmer works with error bits
func TestNormalizeKmerWithErrorBits(t *testing.T) {
seq := []byte("ACGTACGTACGTACGTACGTACGTACGTACG")
k := 31
kmers := EncodeKmers(seq, k, nil)
if len(kmers) != 1 {
t.Fatalf("Expected 1 k-mer, got %d", len(kmers))
}
originalKmer := kmers[0]
// Test with different error codes
for i, errCode := range []uint64{1, 2, 3} {
t.Run("error_code_"+string(rune('0'+i+1)), func(t *testing.T) {
marked := SetKmerError(originalKmer, errCode)
// Normalize should work on the sequence part
normalized := NormalizeKmer(marked, k)
// Error bits should be preserved
if GetKmerError(normalized) != errCode {
t.Errorf("Error bits lost during normalization")
}
// The sequence part should be normalized
cleanNormalized := ClearKmerError(normalized)
expectedNormalized := NormalizeKmer(ClearKmerError(marked), k)
if cleanNormalized != expectedNormalized {
t.Errorf("Normalization incorrect with error bits present")
}
})
}
}
// TestKmerErrorMarkersOddKmers tests that error markers work for all odd k ≤ 31
func TestKmerErrorMarkersOddKmers(t *testing.T) {
oddKSizes := []int{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}
for _, k := range oddKSizes {
t.Run("k="+string(rune('0'+k/10))+string(rune('0'+k%10)), func(t *testing.T) {
// Create a sequence of length k
seq := make([]byte, k)
for i := range seq {
seq[i] = "ACGT"[i%4]
}
kmers := EncodeKmers(seq, k, nil)
if len(kmers) != 1 {
t.Fatalf("Expected 1 k-mer, got %d", len(kmers))
}
originalKmer := kmers[0]
// Verify that k*2 bits fit in 62 bits (top 2 bits should be free)
maxValue := uint64(1)<<(k*2) - 1
if originalKmer > maxValue {
t.Errorf("k-mer exceeds expected bit range for k=%d", k)
}
// Test all error codes
for _, errCode := range []uint64{1, 2, 3} {
marked := SetKmerError(originalKmer, errCode)
// Verify error is set
if GetKmerError(marked) != errCode {
t.Errorf("Error code not preserved for k=%d", k)
}
// Verify sequence is preserved
if ClearKmerError(marked) != originalKmer {
t.Errorf("Sequence corrupted for k=%d", k)
}
}
})
}
}
// BenchmarkExtractSuperKmers benchmarks the super k-mer extraction // BenchmarkExtractSuperKmers benchmarks the super k-mer extraction
func BenchmarkExtractSuperKmers(b *testing.B) { func BenchmarkExtractSuperKmers(b *testing.B) {
sizes := []int{100, 1000, 10000, 100000} sizes := []int{100, 1000, 10000, 100000}