feat: enforce canonical k-mer representation throughout the codebase

Refactor core types to consistently use `CanonicalKmer` (the lexicographic minimum of a k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` with `[CanonicalKmer; 4]` input), and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, and fix type syntax errors in unitig extraction methods and deduplication tests. Update all k-mer construction to use canonical-aware APIs, including unsafe unchecked constructors for performance-critical paths.
This commit is contained in:
Eric Coissac
2026-05-01 13:34:55 +02:00
parent 21ddbf1674
commit defeeb9460
12 changed files with 235 additions and 113 deletions
+103 -45
View File
@@ -151,44 +151,9 @@ impl Kmer {
/// Return the canonical form: lexicographic minimum of forward and reverse complement.
/// Zero allocation — result lives on the stack.
#[inline]
pub fn canonical(&self, k: usize) -> Self {
pub fn canonical(&self, k: usize) -> CanonicalKmer {
let rc = self.revcomp(k);
if self.0 <= rc.0 { *self } else { rc }
}
/// Return a hash of this kmer.
///
/// Uses the canonical form of the kmer to compute the hash.
#[inline]
pub fn seq_hash(&self, k: usize) -> u64 {
mix64(self.canonical(k).0)
}
/// Return the left canonical neighbors of this kmer.
///
/// Zero allocation — result lives on the stack.
pub fn left_canonical_neighbors(&self, k: usize) -> [Kmer; 4] {
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
[
Kmer(shifted).canonical(k),
Kmer(shifted | (1u64 << 62)).canonical(k),
Kmer(shifted | (2u64 << 62)).canonical(k),
Kmer(shifted | (3u64 << 62)).canonical(k),
]
}
/// Return the right canonical neighbors of this kmer.
///
/// Zero allocation — result lives on the stack.
pub fn right_canonical_neighbors(&self, k: usize) -> [Kmer; 4] {
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
let shift = 64 - 2 * k;
[
Kmer(shifted).canonical(k),
Kmer(shifted | (1u64 << shift)).canonical(k),
Kmer(shifted | (2u64 << shift)).canonical(k),
Kmer(shifted | (3u64 << shift)).canonical(k),
]
CanonicalKmer(if self.0 <= rc.0 { *self } else { rc })
}
/// Slide the window one base to the right: drop the first nucleotide, append `nuc` at position k-1.
@@ -215,6 +180,99 @@ impl Kmer {
}
}
// ── CanonicalKmer ─────────────────────────────────────────────────────────────
/// A [`Kmer`] guaranteed to be in canonical form (lexicographic minimum of
/// forward and reverse complement).
///
/// This is a newtype around [`Kmer`]: the inner field is private, so code
/// outside this module cannot build a value without going through one of the
/// constructors below, and the canonical invariant is carried by the type.
///
/// The only public constructors are [`Kmer::canonical`] (checked) and
/// [`CanonicalKmer::from_raw_unchecked`] (for trusted paths such as
/// deserialisation or rolling-window minimizer extraction).
///
/// Equality, ordering and hashing are derived, so they are those of the
/// wrapped [`Kmer`].
// `repr(transparent)` gives the wrapper the same layout as the inner `Kmer`.
#[repr(transparent)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct CanonicalKmer(Kmer);
impl CanonicalKmer {
    /// Wrap a raw left-aligned `u64` without verifying the canonical invariant.
    ///
    /// # Safety (logical)
    /// This is a safe function in the Rust sense, but the caller must guarantee
    /// that `raw == min(raw, revcomp(raw, k))`.
    /// Violations cause silently wrong results in MPHF lookup and graph traversal.
    #[inline]
    pub fn from_raw_unchecked(raw: u64) -> Self {
        CanonicalKmer(Kmer(raw))
    }

    /// Return the raw left-aligned `u64` value.
    #[inline]
    pub fn raw(&self) -> u64 {
        self.0.0
    }

    /// Decode into a freshly allocated ASCII `Vec<u8>`.
    ///
    /// Delegates to [`Kmer::to_ascii`] on the wrapped kmer.
    #[inline]
    pub fn to_ascii(&self, k: usize) -> Vec<u8> {
        self.0.to_ascii(k)
    }

    /// Decode into ASCII nucleotides, writing into `writer`.
    ///
    /// Delegates to [`Kmer::write_ascii`]; any I/O error it reports is
    /// returned unchanged.
    #[inline]
    pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
        self.0.write_ascii(k, writer)
    }

    /// Compute the reverse complement. The result is a raw [`Kmer`] — the
    /// revcomp of a canonical kmer is not necessarily canonical itself.
    #[inline]
    pub fn revcomp(&self, k: usize) -> Kmer {
        self.0.revcomp(k)
    }

    /// Hash via `mix64`. No re-canonicalisation needed.
    ///
    /// `_k` is unused: the value is already canonical, so the raw word is
    /// hashed directly. The parameter is kept so the signature takes `k` like
    /// the other methods.
    #[inline]
    pub fn seq_hash(&self, _k: usize) -> u64 {
        mix64(self.raw())
    }

    /// Return the four left canonical neighbours (each already canonical).
    /// Zero allocation — result lives on the stack.
    ///
    /// The window is moved one base to the left: the raw word is shifted right
    /// by one base (2 bits), the base that falls out of the `k`-base window is
    /// masked off, and each of the four 2-bit nucleotide codes is placed in the
    /// two most significant bits. Every candidate is then canonicalised.
    ///
    /// `k` must be in `1..=32`.
    pub fn left_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
        debug_assert!((1..=32).contains(&k), "k must be in 1..=32, got {k}");
        // Keep only the top 2*k bits after making room for the new first base.
        let shifted = (self.raw() >> 2) & (!0u64 << (64 - 2 * k));
        [0u64, 1, 2, 3].map(|nuc| Kmer(shifted | (nuc << 62)).canonical(k))
    }

    /// Return the four right canonical neighbours (each already canonical).
    /// Zero allocation — result lives on the stack.
    ///
    /// The window is moved one base to the right: the raw word is shifted left
    /// by one base (2 bits), dropping the first base, and each of the four
    /// 2-bit nucleotide codes is placed at base position `k - 1`. Every
    /// candidate is then canonicalised.
    ///
    /// `k` must be in `1..=32`.
    pub fn right_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
        debug_assert!((1..=32).contains(&k), "k must be in 1..=32, got {k}");
        // Bit offset of the last base (position k-1) in the left-aligned word.
        let shift = 64 - 2 * k;
        // Mask covering the top 2*(k-1) bits. It is built as "top 2*k bits,
        // then shifted by one base" rather than `!0 << (64 - 2*(k-1))`, because
        // the latter is a shift by 64 when k == 1, which panics in debug builds
        // and wraps to an all-ones mask in release builds.
        let kept = (!0u64 << shift) << 2;
        let shifted = (self.raw() << 2) & kept;
        [0u64, 1, 2, 3].map(|nuc| Kmer(shifted | (nuc << shift)).canonical(k))
    }

    /// Consume this wrapper and return the inner raw [`Kmer`].
    #[inline]
    pub fn into_kmer(self) -> Kmer {
        self.0
    }
}
impl From<CanonicalKmer> for Kmer {
#[inline]
fn from(ck: CanonicalKmer) -> Self {
ck.0
}
}
// ── tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
@@ -334,33 +392,33 @@ mod tests {
#[test]
fn canonical_palindrome() {
let kmer = Kmer::from_ascii(b"ACGT", 4).unwrap();
assert_eq!(kmer.canonical(4), kmer);
assert_eq!(kmer.canonical(4).into_kmer(), kmer);
}
#[test]
fn canonical_chooses_lesser() {
let kmer = Kmer::from_ascii(b"TTTT", 4).unwrap();
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
assert_eq!(kmer.canonical(4), expected);
assert_eq!(kmer.canonical(4).into_kmer(), expected);
}
#[test]
fn canonical_is_minimal() {
for &k in K_VALUES {
let ascii = make_seq(k);
let kmer = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
let rc = kmer.revcomp(k);
assert!(kmer.0 <= rc.0, "canonical not minimal for k={k}");
let ck = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
let rc = ck.revcomp(k);
assert!(ck.raw() <= rc.raw(), "canonical not minimal for k={k}");
}
}
#[test]
fn canonical_idempotent() {
for &k in K_VALUES {
let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
let ck = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
assert_eq!(
kmer.canonical(k),
kmer,
ck.into_kmer().canonical(k),
ck,
"canonical not idempotent for k={k}"
);
}
+1
View File
@@ -17,5 +17,6 @@ pub mod superkmer;
pub mod unitig;
pub use annotations::Annotation;
pub use kmer::CanonicalKmer;
pub use routable::RoutableSuperKmer;
pub use superkmer::SuperKmer;
+3 -3
View File
@@ -1,6 +1,6 @@
//! Super-kmer with routing metadata: canonical sequence + pre-computed minimizer.
use super::kmer::Kmer;
use super::kmer::CanonicalKmer;
use super::SuperKmer;
/// Owned wrapper that pairs a canonical [`SuperKmer`] with its canonical minimizer [`CanonicalKmer`].
@@ -13,7 +13,7 @@ use super::SuperKmer;
/// [`into_superkmer`]: RoutableSuperKmer::into_superkmer
pub struct RoutableSuperKmer {
superkmer: SuperKmer,
minimizer: Kmer,
minimizer: CanonicalKmer,
}
impl RoutableSuperKmer {
@@ -43,7 +43,7 @@ impl RoutableSuperKmer {
}
/// Borrow the canonical minimizer kmer.
pub fn minimizer(&self) -> &Kmer {
pub fn minimizer(&self) -> &CanonicalKmer {
&self.minimizer
}
+3 -3
View File
@@ -4,7 +4,7 @@ use std::io::{self, Write};
use serde::Serialize;
use crate::encoding::{DEC4, encode_base};
use crate::kmer::{Kmer, KmerError};
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
use crate::revcomp_lookup::REVCOMP4;
use bitvec::prelude::*;
use xxhash_rust::xxh3::xxh3_64;
@@ -275,7 +275,7 @@ impl SuperKmer {
/// Extract the canonical kmer of length k starting at nucleotide position i (0-based).
///
/// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
Ok(self.kmer(i, k)?.canonical(k))
}
@@ -312,7 +312,7 @@ impl SuperKmer {
}
/// Iterate over all canonical kmers of length `k` in order.
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
self.iter_kmers(k).map(move |km| km.canonical(k))
}
+2 -2
View File
@@ -91,7 +91,7 @@ fn canonical_kmer_palindrome_unchanged() {
let sk = SuperKmer::from_ascii(b"ACGT");
let ck = sk.canonical_kmer(0, 4).unwrap();
let fwd = sk.kmer(0, 4).unwrap();
assert_eq!(ck, fwd);
assert_eq!(ck.into_kmer(), fwd);
}
#[test]
@@ -99,7 +99,7 @@ fn canonical_kmer_tttt_becomes_aaaa() {
let sk = SuperKmer::from_ascii(b"TTTT");
let ck = sk.canonical_kmer(0, 4).unwrap();
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
assert_eq!(ck, expected);
assert_eq!(ck.into_kmer(), expected);
}
#[test]
+3 -3
View File
@@ -7,7 +7,7 @@
use std::io::{self, Write};
use crate::encoding::{DEC4, encode_base};
use crate::kmer::{Kmer, KmerError};
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
use crate::revcomp_lookup::REVCOMP4;
use bitvec::prelude::*;
@@ -198,7 +198,7 @@ impl Unitig {
}
/// Extract the canonical kmer of length `k` starting at position `i`.
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
Ok(self.kmer(i, k)?.canonical(k))
}
@@ -208,7 +208,7 @@ impl Unitig {
}
/// Iterate over all canonical kmers of length `k` in order.
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
self.iter_kmers(k).map(move |km| km.canonical(k))
}
}