feat: enforce canonical k-mer representation throughout the codebase
Refactor core types to consistently use `CanonicalKmer` (the lexicographic minimum of a k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` now takes a `[CanonicalKmer; 4]` input), and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, and fix type syntax errors in the unitig extraction methods and the deduplication tests. Update all k-mer construction to go through canonical-aware APIs, including unchecked (logically unsafe) constructors such as `CanonicalKmer::from_raw_unchecked` for performance-critical paths.
This commit is contained in:
+103
-45
@@ -151,44 +151,9 @@ impl Kmer {
|
||||
/// Return the canonical form: lexicographic minimum of forward and reverse complement.
|
||||
/// Zero allocation — result lives on the stack.
|
||||
#[inline]
|
||||
pub fn canonical(&self, k: usize) -> Self {
|
||||
pub fn canonical(&self, k: usize) -> CanonicalKmer {
|
||||
let rc = self.revcomp(k);
|
||||
if self.0 <= rc.0 { *self } else { rc }
|
||||
}
|
||||
|
||||
/// Return a hash of this kmer.
|
||||
///
|
||||
/// Uses the canonical form of the kmer to compute the hash.
|
||||
#[inline]
|
||||
pub fn seq_hash(&self, k: usize) -> u64 {
|
||||
mix64(self.canonical(k).0)
|
||||
}
|
||||
|
||||
/// Return the left canonical neighbors of this kmer.
|
||||
///
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn left_canonical_neighbors(&self, k: usize) -> [Kmer; 4] {
|
||||
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (2u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (3u64 << 62)).canonical(k),
|
||||
]
|
||||
}
|
||||
|
||||
/// Return the right canonical neighbors of this kmer.
|
||||
///
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn right_canonical_neighbors(&self, k: usize) -> [Kmer; 4] {
|
||||
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
let shift = 64 - 2 * k;
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (2u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (3u64 << shift)).canonical(k),
|
||||
]
|
||||
CanonicalKmer(if self.0 <= rc.0 { *self } else { rc })
|
||||
}
|
||||
|
||||
/// Slide the window one base to the right: drop the first nucleotide, append `nuc` at position k-1.
|
||||
@@ -215,6 +180,99 @@ impl Kmer {
|
||||
}
|
||||
}
|
||||
|
||||
// ── CanonicalKmer ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// A [`Kmer`] guaranteed to be in canonical form (lexicographic minimum of
|
||||
/// forward and reverse complement).
|
||||
///
|
||||
/// The only public constructors are [`Kmer::canonical`] (checked) and
|
||||
/// [`CanonicalKmer::from_raw_unchecked`] (for trusted paths such as
|
||||
/// deserialisation or rolling-window minimizer extraction).
|
||||
#[repr(transparent)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct CanonicalKmer(Kmer);
|
||||
|
||||
impl CanonicalKmer {
|
||||
/// Wrap a raw left-aligned u64 without verifying the canonical invariant.
|
||||
///
|
||||
/// # Safety (logical)
|
||||
/// The caller must guarantee that `raw == min(raw, revcomp(raw, k))` for the
/// k-mer length `k` that `raw` was encoded with (this function does not take `k`).
|
||||
/// Violations cause silently wrong results in MPHF lookup and graph traversal.
|
||||
#[inline]
|
||||
pub fn from_raw_unchecked(raw: u64) -> Self {
|
||||
CanonicalKmer(Kmer(raw))
|
||||
}
|
||||
|
||||
/// Return the raw left-aligned u64 value.
|
||||
#[inline]
|
||||
pub fn raw(&self) -> u64 {
|
||||
self.0.0
|
||||
}
|
||||
|
||||
/// Decode into a freshly allocated ASCII `Vec<u8>`.
|
||||
#[inline]
|
||||
pub fn to_ascii(&self, k: usize) -> Vec<u8> {
|
||||
self.0.to_ascii(k)
|
||||
}
|
||||
|
||||
/// Decode into ASCII nucleotides, writing into `writer`.
|
||||
#[inline]
|
||||
pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
|
||||
self.0.write_ascii(k, writer)
|
||||
}
|
||||
|
||||
/// Compute the reverse complement. The result is a raw [`Kmer`] — the
|
||||
/// revcomp of a canonical kmer is not necessarily canonical itself.
|
||||
#[inline]
|
||||
pub fn revcomp(&self, k: usize) -> Kmer {
|
||||
self.0.revcomp(k)
|
||||
}
|
||||
|
||||
/// Hash via `mix64`. No re-canonicalisation needed.
|
||||
#[inline]
|
||||
pub fn seq_hash(&self, _k: usize) -> u64 {
|
||||
mix64(self.0.0)
|
||||
}
|
||||
|
||||
/// Return the four left canonical neighbours (each already canonical).
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn left_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
|
||||
let shifted = (self.raw() >> 2) & (!0u64 << (64 - 2 * k));
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (2u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (3u64 << 62)).canonical(k),
|
||||
]
|
||||
}
|
||||
|
||||
/// Return the four right canonical neighbours (each already canonical).
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn right_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
|
||||
let shifted = self.raw() << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
let shift = 64 - 2 * k;
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (2u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (3u64 << shift)).canonical(k),
|
||||
]
|
||||
}
|
||||
|
||||
/// Consume this wrapper and return the inner raw [`Kmer`].
|
||||
#[inline]
|
||||
pub fn into_kmer(self) -> Kmer {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CanonicalKmer> for Kmer {
|
||||
#[inline]
|
||||
fn from(ck: CanonicalKmer) -> Self {
|
||||
ck.0
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -334,33 +392,33 @@ mod tests {
|
||||
#[test]
|
||||
fn canonical_palindrome() {
|
||||
let kmer = Kmer::from_ascii(b"ACGT", 4).unwrap();
|
||||
assert_eq!(kmer.canonical(4), kmer);
|
||||
assert_eq!(kmer.canonical(4).into_kmer(), kmer);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_lesser() {
|
||||
let kmer = Kmer::from_ascii(b"TTTT", 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(kmer.canonical(4), expected);
|
||||
assert_eq!(kmer.canonical(4).into_kmer(), expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_is_minimal() {
|
||||
for &k in K_VALUES {
|
||||
let ascii = make_seq(k);
|
||||
let kmer = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
|
||||
let rc = kmer.revcomp(k);
|
||||
assert!(kmer.0 <= rc.0, "canonical not minimal for k={k}");
|
||||
let ck = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
|
||||
let rc = ck.revcomp(k);
|
||||
assert!(ck.raw() <= rc.raw(), "canonical not minimal for k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_idempotent() {
|
||||
for &k in K_VALUES {
|
||||
let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
|
||||
let ck = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
|
||||
assert_eq!(
|
||||
kmer.canonical(k),
|
||||
kmer,
|
||||
ck.into_kmer().canonical(k),
|
||||
ck,
|
||||
"canonical not idempotent for k={k}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -17,5 +17,6 @@ pub mod superkmer;
|
||||
pub mod unitig;
|
||||
|
||||
pub use annotations::Annotation;
|
||||
pub use kmer::CanonicalKmer;
|
||||
pub use routable::RoutableSuperKmer;
|
||||
pub use superkmer::SuperKmer;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! Super-kmer with routing metadata: canonical sequence + pre-computed minimizer.
|
||||
|
||||
use super::kmer::Kmer;
|
||||
use super::kmer::CanonicalKmer;
|
||||
use super::SuperKmer;
|
||||
|
||||
/// Owned wrapper that pairs a canonical [`SuperKmer`] with its minimizer [`CanonicalKmer`].
|
||||
@@ -13,7 +13,7 @@ use super::SuperKmer;
|
||||
/// [`into_superkmer`]: RoutableSuperKmer::into_superkmer
|
||||
pub struct RoutableSuperKmer {
|
||||
superkmer: SuperKmer,
|
||||
minimizer: Kmer,
|
||||
minimizer: CanonicalKmer,
|
||||
}
|
||||
|
||||
impl RoutableSuperKmer {
|
||||
@@ -43,7 +43,7 @@ impl RoutableSuperKmer {
|
||||
}
|
||||
|
||||
/// Borrow the canonical minimizer kmer.
|
||||
pub fn minimizer(&self) -> &Kmer {
|
||||
pub fn minimizer(&self) -> &CanonicalKmer {
|
||||
&self.minimizer
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::io::{self, Write};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
use bitvec::prelude::*;
|
||||
use xxhash_rust::xxh3::xxh3_64;
|
||||
@@ -275,7 +275,7 @@ impl SuperKmer {
|
||||
/// Extract the canonical kmer of length k starting at nucleotide position i (0-based).
|
||||
///
|
||||
/// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
}
|
||||
|
||||
@@ -312,7 +312,7 @@ impl SuperKmer {
|
||||
}
|
||||
|
||||
/// Iterate over all canonical kmers of length `k` in order.
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||
}
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ fn canonical_kmer_palindrome_unchanged() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let fwd = sk.kmer(0, 4).unwrap();
|
||||
assert_eq!(ck, fwd);
|
||||
assert_eq!(ck.into_kmer(), fwd);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -99,7 +99,7 @@ fn canonical_kmer_tttt_becomes_aaaa() {
|
||||
let sk = SuperKmer::from_ascii(b"TTTT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(ck, expected);
|
||||
assert_eq!(ck.into_kmer(), expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
use bitvec::prelude::*;
|
||||
|
||||
@@ -198,7 +198,7 @@ impl Unitig {
|
||||
}
|
||||
|
||||
/// Extract the canonical kmer of length `k` starting at position `i`.
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
}
|
||||
|
||||
@@ -208,7 +208,7 @@ impl Unitig {
|
||||
}
|
||||
|
||||
/// Iterate over all canonical kmers of length `k` in order.
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user