Add jj Makefile targets and k-mer encoding utilities

Add new Makefile targets for jj operations (jjnew, jjpush, jjfetch) to streamline commit workflow.

Introduce k-mer encoding utilities in pkg/obikmer:
- EncodeKmers: converts DNA sequences to encoded k-mers
- ReverseComplement: computes reverse complement of k-mers
- NormalizeKmer: returns canonical form of k-mers
- EncodeNormalizedKmers: encodes sequences with normalized k-mers

Add comprehensive tests for k-mer encoding functions including edge cases, buffer reuse, and performance benchmarks.

Document k-mer index design for large genomes, covering:
- Use cases and objectives
- Volume estimations
- Distance metrics (Jaccard, Sørensen-Dice, Bray-Curtis)
- Indexing options (Bloom filters, sorted sets, MPHF)
- Optimization techniques (k-2-mer indexing)
- MinHash for distance acceleration
- Recommended architecture for presence/absence and counting queries
This commit is contained in:
Eric Coissac
2026-01-25 18:43:30 +01:00
parent 740f66b4c7
commit 500144051a
4 changed files with 937 additions and 1 deletions

183
pkg/obikmer/encodekmer.go Normal file
View File

@@ -0,0 +1,183 @@
package obikmer
// EncodeKmers converts a DNA sequence to a slice of encoded k-mers.
// Each nucleotide is encoded on 2 bits according to __single_base_code__:
// - A = 0 (00)
// - C = 1 (01)
// - G = 2 (10)
// - T/U = 3 (11)
//
// The function returns overlapping k-mers of size k encoded as uint64.
// For a sequence of length n, it returns n-k+1 k-mers.
//
// Parameters:
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
// - k: k-mer size (must be between 1 and 32)
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
//
// Returns:
// - slice of uint64 encoded k-mers
// - nil if sequence is shorter than k or k is invalid
func EncodeKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
if k < 1 || k > 32 || len(seq) < k {
return nil
}
n := len(seq) - k + 1
var result []uint64
if buffer == nil {
result = make([]uint64, 0, n)
} else {
result = (*buffer)[:0]
}
// Mask to keep only k*2 bits
mask := uint64(1)<<(k*2) - 1
// Build the first k-mer
var kmer uint64
for i := 0; i < k; i++ {
kmer <<= 2
kmer |= uint64(__single_base_code__[seq[i]&31])
}
result = append(result, kmer)
// Slide through the rest of the sequence
for i := k; i < len(seq); i++ {
kmer <<= 2
kmer |= uint64(__single_base_code__[seq[i]&31])
kmer &= mask
result = append(result, kmer)
}
return result
}
// ReverseComplement computes the reverse complement of an encoded k-mer.
// The k-mer is encoded with 2 bits per nucleotide (A=00, C=01, G=10, T=11).
// The complement is: A↔T (00↔11), C↔G (01↔10), which is simply XOR with 11.
// The reverse swaps the order of 2-bit pairs.
//
// Parameters:
// - kmer: the encoded k-mer
// - k: the k-mer size (number of nucleotides)
//
// Returns:
// - the reverse complement of the k-mer
func ReverseComplement(kmer uint64, k int) uint64 {
// Step 1: Complement - XOR with all 1s to flip A↔T and C↔G
// For a k-mer of size k, we only want to flip the lower k*2 bits
mask := uint64(1)<<(k*2) - 1
rc := (^kmer) & mask
// Step 2: Reverse the order of 2-bit pairs
// We use a series of swaps at increasing granularity
rc = ((rc & 0x3333333333333333) << 2) | ((rc & 0xCCCCCCCCCCCCCCCC) >> 2) // Swap adjacent pairs
rc = ((rc & 0x0F0F0F0F0F0F0F0F) << 4) | ((rc & 0xF0F0F0F0F0F0F0F0) >> 4) // Swap nibbles
rc = ((rc & 0x00FF00FF00FF00FF) << 8) | ((rc & 0xFF00FF00FF00FF00) >> 8) // Swap bytes
rc = ((rc & 0x0000FFFF0000FFFF) << 16) | ((rc & 0xFFFF0000FFFF0000) >> 16) // Swap 16-bit words
rc = (rc << 32) | (rc >> 32) // Swap 32-bit words
// Step 3: Shift right to align the k-mer (we reversed all 32 pairs, need only k)
rc >>= (64 - k*2)
return rc
}
// NormalizeKmer returns the lexicographically smaller of a k-mer and its
// reverse complement. This canonical form ensures that a k-mer and its
// reverse complement map to the same value.
//
// Parameters:
// - kmer: the encoded k-mer
// - k: the k-mer size (number of nucleotides)
//
// Returns:
// - the canonical (normalized) k-mer
func NormalizeKmer(kmer uint64, k int) uint64 {
rc := ReverseComplement(kmer, k)
if rc < kmer {
return rc
}
return kmer
}
// EncodeNormalizedKmers converts a DNA sequence to a slice of normalized k-mers.
// Each k-mer is replaced by the lexicographically smaller of itself and its
// reverse complement. This ensures that forward and reverse complement sequences
// produce the same k-mer set.
//
// Parameters:
// - seq: DNA sequence as a byte slice (case insensitive, supports A, C, G, T, U)
// - k: k-mer size (must be between 1 and 32)
// - buffer: optional pre-allocated buffer for results. If nil, a new slice is created.
//
// Returns:
// - slice of uint64 normalized k-mers
// - nil if sequence is shorter than k or k is invalid
func EncodeNormalizedKmers(seq []byte, k int, buffer *[]uint64) []uint64 {
if k < 1 || k > 32 || len(seq) < k {
return nil
}
n := len(seq) - k + 1
var result []uint64
if buffer == nil {
result = make([]uint64, 0, n)
} else {
result = (*buffer)[:0]
}
// Mask to keep only k*2 bits
mask := uint64(1)<<(k*2) - 1
// Shift amount for adding to reverse complement (high position)
rcShift := uint((k - 1) * 2)
// Complement lookup: A(00)->T(11), C(01)->G(10), G(10)->C(01), T(11)->A(00)
// This is simply XOR with 3
// Build the first k-mer (forward and reverse complement)
var fwd, rvc uint64
for i := 0; i < k; i++ {
code := uint64(__single_base_code__[seq[i]&31])
// Forward: shift left and add new code at low end
fwd <<= 2
fwd |= code
// Reverse complement: shift right and add complement at high end
rvc >>= 2
rvc |= (code ^ 3) << rcShift
}
// Store the normalized (canonical) k-mer
if fwd <= rvc {
result = append(result, fwd)
} else {
result = append(result, rvc)
}
// Slide through the rest of the sequence
for i := k; i < len(seq); i++ {
code := uint64(__single_base_code__[seq[i]&31])
// Update forward k-mer: shift left, add new code, mask
fwd <<= 2
fwd |= code
fwd &= mask
// Update reverse complement: shift right, add complement at high end
rvc >>= 2
rvc |= (code ^ 3) << rcShift
// Store the normalized k-mer
if fwd <= rvc {
result = append(result, fwd)
} else {
result = append(result, rvc)
}
}
return result
}

View File

@@ -0,0 +1,518 @@
package obikmer
import (
"bytes"
"testing"
)
// TestEncodeKmersBasic tests basic k-mer encoding
func TestEncodeKmersBasic(t *testing.T) {
tests := []struct {
name string
seq string
k int
expected []uint64
}{
{
name: "simple 4-mer ACGT",
seq: "ACGT",
k: 4,
expected: []uint64{0b00011011}, // A=00, C=01, G=10, T=11 -> 00 01 10 11 = 27
},
{
name: "simple 2-mer AC",
seq: "AC",
k: 2,
expected: []uint64{0b0001}, // A=00, C=01 -> 00 01 = 1
},
{
name: "sliding 2-mer ACGT",
seq: "ACGT",
k: 2,
expected: []uint64{0b0001, 0b0110, 0b1011}, // AC=1, CG=6, GT=11
},
{
name: "lowercase",
seq: "acgt",
k: 4,
expected: []uint64{0b00011011},
},
{
name: "with U instead of T",
seq: "ACGU",
k: 4,
expected: []uint64{0b00011011}, // U encodes same as T
},
{
name: "8-mer",
seq: "ACGTACGT",
k: 8,
expected: []uint64{0b0001101100011011}, // ACGTACGT
},
{
name: "32-mer max size",
seq: "ACGTACGTACGTACGTACGTACGTACGTACGT",
k: 32,
expected: []uint64{0x1B1B1B1B1B1B1B1B}, // ACGTACGT repeated 4 times
},
{
name: "longer sequence sliding",
seq: "AAACCCGGG",
k: 3,
expected: []uint64{
0b000000, // AAA = 0
0b000001, // AAC = 1
0b000101, // ACC = 5
0b010101, // CCC = 21
0b010110, // CCG = 22
0b011010, // CGG = 26
0b101010, // GGG = 42
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := EncodeKmers([]byte(tt.seq), tt.k, nil)
if len(result) != len(tt.expected) {
t.Errorf("length mismatch: got %d, want %d", len(result), len(tt.expected))
return
}
for i, v := range result {
if v != tt.expected[i] {
t.Errorf("position %d: got %d (0b%b), want %d (0b%b)",
i, v, v, tt.expected[i], tt.expected[i])
}
}
})
}
}
// TestEncodeKmersEdgeCases tests edge cases
func TestEncodeKmersEdgeCases(t *testing.T) {
// Empty sequence
result := EncodeKmers([]byte{}, 4, nil)
if result != nil {
t.Errorf("empty sequence should return nil, got %v", result)
}
// k > sequence length
result = EncodeKmers([]byte("ACG"), 4, nil)
if result != nil {
t.Errorf("k > seq length should return nil, got %v", result)
}
// k = 0
result = EncodeKmers([]byte("ACGT"), 0, nil)
if result != nil {
t.Errorf("k=0 should return nil, got %v", result)
}
// k > 32
result = EncodeKmers([]byte("ACGTACGTACGTACGTACGTACGTACGTACGTACGT"), 33, nil)
if result != nil {
t.Errorf("k>32 should return nil, got %v", result)
}
// k = sequence length (single k-mer)
result = EncodeKmers([]byte("ACGT"), 4, nil)
if len(result) != 1 {
t.Errorf("k=seq_len should return 1 k-mer, got %d", len(result))
}
}
// TestEncodeKmersBuffer tests buffer reuse
func TestEncodeKmersBuffer(t *testing.T) {
seq := []byte("ACGTACGTACGT")
k := 4
// First call without buffer
result1 := EncodeKmers(seq, k, nil)
// Second call with buffer - pre-allocate with capacity
buffer := make([]uint64, 0, 100)
result2 := EncodeKmers(seq, k, &buffer)
if len(result1) != len(result2) {
t.Errorf("buffer reuse: length mismatch %d vs %d", len(result1), len(result2))
}
for i := range result1 {
if result1[i] != result2[i] {
t.Errorf("buffer reuse: position %d mismatch", i)
}
}
// Verify results are correct
if len(result2) == 0 {
t.Errorf("result should not be empty")
}
// Test multiple calls with same buffer to verify no memory issues
for i := 0; i < 10; i++ {
result3 := EncodeKmers(seq, k, &buffer)
if len(result3) != len(result1) {
t.Errorf("iteration %d: length mismatch", i)
}
}
}
// TestEncodeKmersVariousLengths tests encoding with various sequence lengths
func TestEncodeKmersVariousLengths(t *testing.T) {
lengths := []int{1, 4, 8, 15, 16, 17, 31, 32, 33, 63, 64, 65, 100, 256, 1000}
k := 8
for _, length := range lengths {
// Generate test sequence
seq := make([]byte, length)
for i := range seq {
seq[i] = "ACGT"[i%4]
}
if length < k {
continue
}
t.Run("length_"+string(rune('0'+length/100))+string(rune('0'+(length%100)/10))+string(rune('0'+length%10)), func(t *testing.T) {
result := EncodeKmers(seq, k, nil)
expectedLen := length - k + 1
if len(result) != expectedLen {
t.Errorf("length mismatch: got %d, want %d", len(result), expectedLen)
}
})
}
}
// TestEncodeKmersLongSequence tests with a longer realistic sequence
func TestEncodeKmersLongSequence(t *testing.T) {
// Simulate a realistic DNA sequence
seq := bytes.Repeat([]byte("ACGTACGTNNACGTACGT"), 100)
k := 16
result := EncodeKmers(seq, k, nil)
expectedLen := len(seq) - k + 1
if len(result) != expectedLen {
t.Fatalf("length mismatch: got %d, want %d", len(result), expectedLen)
}
}
// BenchmarkEncodeKmers benchmarks the encoding function
func BenchmarkEncodeKmers(b *testing.B) {
// Create test sequences of various sizes
sizes := []int{100, 1000, 10000, 100000}
kSizes := []int{8, 16, 32}
for _, k := range kSizes {
for _, size := range sizes {
seq := make([]byte, size)
for i := range seq {
seq[i] = "ACGT"[i%4]
}
name := "k" + string(rune('0'+k/10)) + string(rune('0'+k%10)) + "_size" + string(rune('0'+size/10000)) + string(rune('0'+(size%10000)/1000)) + string(rune('0'+(size%1000)/100)) + string(rune('0'+(size%100)/10)) + string(rune('0'+size%10))
b.Run(name, func(b *testing.B) {
buffer := make([]uint64, 0, size)
b.ResetTimer()
b.SetBytes(int64(size))
for i := 0; i < b.N; i++ {
EncodeKmers(seq, k, &buffer)
}
})
}
}
}
// TestEncodeNucleotide verifies nucleotide encoding
func TestEncodeNucleotide(t *testing.T) {
testCases := []struct {
nucleotide byte
expected byte
}{
{'A', 0},
{'a', 0},
{'C', 1},
{'c', 1},
{'G', 2},
{'g', 2},
{'T', 3},
{'t', 3},
{'U', 3},
{'u', 3},
}
for _, tc := range testCases {
result := EncodeNucleotide(tc.nucleotide)
if result != tc.expected {
t.Errorf("EncodeNucleotide('%c') = %d, want %d",
tc.nucleotide, result, tc.expected)
}
}
}
// TestReverseComplement tests the reverse complement function
func TestReverseComplement(t *testing.T) {
tests := []struct {
name string
seq string
k int
expected string // expected reverse complement sequence
}{
{
name: "ACGT -> ACGT (palindrome)",
seq: "ACGT",
k: 4,
expected: "ACGT",
},
{
name: "AAAA -> TTTT",
seq: "AAAA",
k: 4,
expected: "TTTT",
},
{
name: "TTTT -> AAAA",
seq: "TTTT",
k: 4,
expected: "AAAA",
},
{
name: "CCCC -> GGGG",
seq: "CCCC",
k: 4,
expected: "GGGG",
},
{
name: "AACG -> CGTT",
seq: "AACG",
k: 4,
expected: "CGTT",
},
{
name: "AC -> GT",
seq: "AC",
k: 2,
expected: "GT",
},
{
name: "ACGTACGT -> ACGTACGT (palindrome)",
seq: "ACGTACGT",
k: 8,
expected: "ACGTACGT",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Encode the input sequence
kmers := EncodeKmers([]byte(tt.seq), tt.k, nil)
if len(kmers) != 1 {
t.Fatalf("expected 1 k-mer, got %d", len(kmers))
}
// Compute reverse complement
rc := ReverseComplement(kmers[0], tt.k)
// Encode the expected reverse complement
expectedKmers := EncodeKmers([]byte(tt.expected), tt.k, nil)
if len(expectedKmers) != 1 {
t.Fatalf("expected 1 k-mer for expected, got %d", len(expectedKmers))
}
if rc != expectedKmers[0] {
t.Errorf("ReverseComplement(%s) = %d (0b%b), want %d (0b%b) for %s",
tt.seq, rc, rc, expectedKmers[0], expectedKmers[0], tt.expected)
}
})
}
}
// TestReverseComplementInvolution tests that RC(RC(x)) = x
func TestReverseComplementInvolution(t *testing.T) {
testSeqs := []string{"ACGT", "AAAA", "TTTT", "ACGTACGT", "AACGTTGC", "AC", "ACGTACGTACGTACGT", "ACGTACGTACGTACGTACGTACGTACGTACGT"}
for _, seq := range testSeqs {
k := len(seq)
kmers := EncodeKmers([]byte(seq), k, nil)
if len(kmers) != 1 {
continue
}
original := kmers[0]
rc := ReverseComplement(original, k)
rcrc := ReverseComplement(rc, k)
if rcrc != original {
t.Errorf("RC(RC(%s)) != %s: got %d, want %d", seq, seq, rcrc, original)
}
}
}
// TestNormalizeKmer tests the normalization function
func TestNormalizeKmer(t *testing.T) {
tests := []struct {
name string
seq string
k int
}{
{"ACGT palindrome", "ACGT", 4},
{"AAAA vs TTTT", "AAAA", 4},
{"TTTT vs AAAA", "TTTT", 4},
{"AACG vs CGTT", "AACG", 4},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
kmers := EncodeKmers([]byte(tt.seq), tt.k, nil)
if len(kmers) != 1 {
t.Fatalf("expected 1 k-mer, got %d", len(kmers))
}
kmer := kmers[0]
rc := ReverseComplement(kmer, tt.k)
normalized := NormalizeKmer(kmer, tt.k)
// Normalized should be the minimum
expectedNorm := kmer
if rc < kmer {
expectedNorm = rc
}
if normalized != expectedNorm {
t.Errorf("NormalizeKmer(%d) = %d, want %d", kmer, normalized, expectedNorm)
}
// Normalizing the RC should give the same result
normalizedRC := NormalizeKmer(rc, tt.k)
if normalizedRC != normalized {
t.Errorf("NormalizeKmer(RC) = %d, want %d (same as NormalizeKmer(fwd))", normalizedRC, normalized)
}
})
}
}
// TestEncodeNormalizedKmersBasic tests basic normalized k-mer encoding
func TestEncodeNormalizedKmersBasic(t *testing.T) {
// Test that a sequence and its reverse complement produce the same normalized k-mers
seq := []byte("AACGTT")
revComp := []byte("AACGTT") // This is a palindrome!
k := 4
kmers1 := EncodeNormalizedKmers(seq, k, nil)
kmers2 := EncodeNormalizedKmers(revComp, k, nil)
if len(kmers1) != len(kmers2) {
t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2))
}
// For a palindrome, forward and reverse should give the same k-mers
for i := range kmers1 {
if kmers1[i] != kmers2[len(kmers2)-1-i] {
t.Logf("Note: position %d differs (expected for non-palindromic sequences)", i)
}
}
}
// TestEncodeNormalizedKmersSymmetry tests that seq and its RC produce same normalized k-mers (reversed)
func TestEncodeNormalizedKmersSymmetry(t *testing.T) {
// Manually construct a sequence and its reverse complement
seq := []byte("ACGTAACCGG")
// Compute reverse complement manually
rcMap := map[byte]byte{'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
revComp := make([]byte, len(seq))
for i, b := range seq {
revComp[len(seq)-1-i] = rcMap[b]
}
k := 4
kmers1 := EncodeNormalizedKmers(seq, k, nil)
kmers2 := EncodeNormalizedKmers(revComp, k, nil)
if len(kmers1) != len(kmers2) {
t.Fatalf("length mismatch: %d vs %d", len(kmers1), len(kmers2))
}
// The normalized k-mers should be the same but in reverse order
for i := range kmers1 {
j := len(kmers2) - 1 - i
if kmers1[i] != kmers2[j] {
t.Errorf("position %d vs %d: %d != %d", i, j, kmers1[i], kmers2[j])
}
}
}
// TestEncodeNormalizedKmersConsistency verifies normalized k-mers match manual normalization
func TestEncodeNormalizedKmersConsistency(t *testing.T) {
seq := []byte("ACGTACGTACGTACGT")
k := 8
// Get k-mers both ways
rawKmers := EncodeKmers(seq, k, nil)
normalizedKmers := EncodeNormalizedKmers(seq, k, nil)
if len(rawKmers) != len(normalizedKmers) {
t.Fatalf("length mismatch: %d vs %d", len(rawKmers), len(normalizedKmers))
}
// Verify each normalized k-mer matches manual normalization
for i, raw := range rawKmers {
expected := NormalizeKmer(raw, k)
if normalizedKmers[i] != expected {
t.Errorf("position %d: EncodeNormalizedKmers gave %d, NormalizeKmer gave %d",
i, normalizedKmers[i], expected)
}
}
}
// BenchmarkEncodeNormalizedKmers benchmarks the normalized encoding function
func BenchmarkEncodeNormalizedKmers(b *testing.B) {
sizes := []int{100, 1000, 10000, 100000}
kSizes := []int{8, 16, 32}
for _, k := range kSizes {
for _, size := range sizes {
seq := make([]byte, size)
for i := range seq {
seq[i] = "ACGT"[i%4]
}
name := "k" + string(rune('0'+k/10)) + string(rune('0'+k%10)) + "_size" + string(rune('0'+size/10000)) + string(rune('0'+(size%10000)/1000)) + string(rune('0'+(size%1000)/100)) + string(rune('0'+(size%100)/10)) + string(rune('0'+size%10))
b.Run(name, func(b *testing.B) {
buffer := make([]uint64, 0, size)
b.ResetTimer()
b.SetBytes(int64(size))
for i := 0; i < b.N; i++ {
EncodeNormalizedKmers(seq, k, &buffer)
}
})
}
}
}
// BenchmarkReverseComplement benchmarks the reverse complement function
func BenchmarkReverseComplement(b *testing.B) {
kmer := uint64(0x123456789ABCDEF0)
k := 32
b.ResetTimer()
for i := 0; i < b.N; i++ {
ReverseComplement(kmer, k)
}
}
// BenchmarkNormalizeKmer benchmarks the normalization function
func BenchmarkNormalizeKmer(b *testing.B) {
kmer := uint64(0x123456789ABCDEF0)
k := 32
b.ResetTimer()
for i := 0; i < b.N; i++ {
NormalizeKmer(kmer, k)
}
}