feat: enforce canonical k-mer representation throughout the codebase

Refactor core types to consistently use `CanonicalKMer` (lexicographically minimal of k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` with `[CanonicalKmer; 4]` input) and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, fix type syntax errors in unitig extraction methods and deduplication tests. Update all k-mer construction to use canonical-aware APIs, including unsafe unchecked constructors for performance-critical paths.
This commit is contained in:
Eric Coissac
2026-05-01 13:34:55 +02:00
parent 21ddbf1674
commit defeeb9460
12 changed files with 235 additions and 113 deletions
+60 -40
View File
@@ -1,7 +1,7 @@
use ahash::RandomState;
use hashbrown::HashMap;
use obifastwrite::write_unitig;
use obikseq::kmer::Kmer;
use obikseq::kmer::{CanonicalKmer, Kmer};
use obikseq::unitig::Unitig;
use std::cell::Cell;
use std::io;
@@ -76,7 +76,7 @@ impl Node {
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
pub struct GraphDeBruijn {
nodes: FastHashMap<Kmer, Cell<Node>>,
nodes: FastHashMap<CanonicalKmer, Cell<Node>>,
k: usize,
}
@@ -95,10 +95,10 @@ impl GraphDeBruijn {
}
}
/// Insert `kmer` (canonicalised) into the graph. No-op if already present.
pub fn push(&mut self, kmer: Kmer) {
/// Insert a canonical kmer into the graph. No-op if already present.
pub fn push(&mut self, kmer: CanonicalKmer) {
self.nodes
.entry(kmer.canonical(self.k))
.entry(kmer)
.or_insert_with(|| Cell::new(Node::default()));
}
@@ -108,8 +108,8 @@ impl GraphDeBruijn {
/// Single pass thanks to Cell interior mutability.
pub fn compute_degrees(&self) {
for (&kmer, cell) in &self.nodes {
let right_nuc = unique_neighbor(&kmer.right_canonical_neighbors(self.k), &self.nodes);
let left_nuc = unique_neighbor(&kmer.left_canonical_neighbors(self.k), &self.nodes);
let right_nuc = unique_neighbor(kmer.right_canonical_neighbors(self.k), &self.nodes);
let left_nuc = unique_neighbor(kmer.left_canonical_neighbors(self.k), &self.nodes);
let mut node = cell.get();
node.set_right(right_nuc);
@@ -136,8 +136,8 @@ impl GraphDeBruijn {
if node.is_visited() {
return None;
}
let start = if !node.can_extend_left() {
kmer
let start: Kmer = if !node.can_extend_left() {
kmer.into_kmer()
} else if !node.can_extend_right() {
kmer.revcomp(k)
} else {
@@ -159,7 +159,7 @@ impl GraphDeBruijn {
let mut updated = node;
updated.set_visited();
cell.set(updated);
Some(kmer)
Some(kmer.into_kmer())
});
chain_starts.chain(cycle_starts)
@@ -174,16 +174,21 @@ impl GraphDeBruijn {
///
/// The returned kmer is oriented so that its first k-1 bases match
/// the last k-1 bases of `kmer` (proper De Bruijn overlap).
pub fn next_kmer(&self, kmer: Kmer) -> Option<Kmer> {
pub fn next_unitig_kmer(&self, kmer: Kmer, is_at_start: bool) -> Option<Kmer> {
let canonical = kmer.canonical(self.k);
let node = self.nodes.get(&canonical)?.get();
let next = if kmer == canonical {
if !is_at_start && (!node.can_extend_left() || !node.can_extend_right()) {
return None;
}
let next: CanonicalKmer = if kmer.raw() == canonical.raw() {
if !node.can_extend_right() {
return None;
}
// push_right gives the raw right extension (non-canonical) that properly extends kmer
// push_right gives the raw right extension that properly extends kmer
canonical
.into_kmer()
.push_right(node.right_nuc(), self.k)
.canonical(self.k)
} else {
@@ -192,6 +197,7 @@ impl GraphDeBruijn {
}
// push_left gives the left extension of canonical; its revcomp is the right extension of kmer
canonical
.into_kmer()
.push_left(node.left_nuc(), self.k)
.canonical(self.k)
};
@@ -204,8 +210,8 @@ impl GraphDeBruijn {
return None;
}
let oriented = if kmer.is_overlapping(next, self.k) {
next
let oriented = if kmer.is_overlapping(next.into_kmer(), self.k) {
next.into_kmer()
} else {
next.revcomp(self.k)
};
@@ -227,10 +233,7 @@ impl GraphDeBruijn {
/// - first item → `kmer.nucleotide(0..k)` (k bases, the seed)
/// - next items → `kmer.nucleotide(k-1)` (1 new base each)
pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
UnitigIter {
graph: self,
current: Some(start),
}
UnitigIter::new(self, start)
}
/// Iterate over all unitigs in the graph.
@@ -275,6 +278,17 @@ impl GraphDeBruijn {
pub struct UnitigIter<'a> {
graph: &'a GraphDeBruijn,
current: Option<Kmer>,
at_start: bool,
}
impl<'a> UnitigIter<'a> {
fn new(graph: &'a GraphDeBruijn, start: Kmer) -> Self {
Self {
graph,
current: Some(start),
at_start: true,
}
}
}
impl<'a> Iterator for UnitigIter<'a> {
@@ -282,8 +296,9 @@ impl<'a> Iterator for UnitigIter<'a> {
fn next(&mut self) -> Option<Kmer> {
let current = self.current?;
// next_kmer handles visited marking and cycle detection
self.current = self.graph.next_kmer(current);
// next_unitig_kmer handles visited marking and cycle detection
self.current = self.graph.next_unitig_kmer(current, self.at_start);
self.at_start = false;
Some(current)
}
}
@@ -293,7 +308,10 @@ impl<'a> Iterator for UnitigIter<'a> {
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
/// zero or ≥2 existing neighbours.
fn unique_neighbor(neighbors: &[Kmer; 4], nodes: &FastHashMap<Kmer, Cell<Node>>) -> Option<u8> {
fn unique_neighbor(
neighbors: [CanonicalKmer; 4],
nodes: &FastHashMap<CanonicalKmer, Cell<Node>>,
) -> Option<u8> {
let mut found: Option<u8> = None;
for (i, neighbour) in neighbors.iter().enumerate() {
if nodes.contains_key(neighbour) {
@@ -316,15 +334,14 @@ mod tests {
fn graph_from_ascii(seq: &[u8], k: usize) -> GraphDeBruijn {
let mut g = GraphDeBruijn::new(k);
for i in 0..=seq.len().saturating_sub(k) {
let kmer = Kmer::from_ascii(&seq[i..i + k], k).unwrap();
g.push(kmer);
g.push(Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k));
}
g
}
// Collect all canonical k-mers from an ASCII sequence into a sorted vec.
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<Kmer> {
let mut v: Vec<Kmer> = (0..=seq.len().saturating_sub(k))
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<CanonicalKmer> {
let mut v: Vec<CanonicalKmer> = (0..=seq.len().saturating_sub(k))
.map(|i| Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k))
.collect();
v.sort_unstable();
@@ -338,10 +355,9 @@ mod tests {
fn push_deduplicates_revcomp() {
let k = 5;
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
let rc = kmer.revcomp(k);
let mut g = GraphDeBruijn::new(k);
g.push(kmer);
g.push(rc);
g.push(kmer.canonical(k));
g.push(kmer.revcomp(k).canonical(k));
assert_eq!(g.len(), 1, "kmer and its revcomp must map to the same node");
}
@@ -352,7 +368,7 @@ mod tests {
let kmer = Kmer::from_ascii(b"ACGT", k).unwrap();
assert_eq!(kmer, kmer.revcomp(k), "test requires a palindrome");
let mut g = GraphDeBruijn::new(k);
g.push(kmer);
g.push(kmer.canonical(k));
assert_eq!(g.len(), 1);
}
@@ -360,7 +376,7 @@ mod tests {
// AAAAGGGG with k=5 → 4 distinct k-mers (AAAAG, AAAGG, AAGGG, AGGGG),
// clean linear chain, no Watson-Crick palindrome in first k-1 bases.
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<Kmer>) {
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<CanonicalKmer>) {
let seq = b"AAAAGGGG";
let g = graph_from_ascii(seq, k);
let kmers = canonical_kmers(seq, k);
@@ -386,7 +402,11 @@ mod tests {
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
// seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
assert_eq!(unitigs[0].seql(), seq.len(), "unitig spans the full sequence");
assert_eq!(
unitigs[0].seql(),
seq.len(),
"unitig spans the full sequence"
);
assert_eq!(
kmers_from_unitigs(&unitigs, k),
canonical_kmers(seq, k),
@@ -397,8 +417,8 @@ mod tests {
// ── unitig reconstruction ─────────────────────────────────────────────────
// Round-trip: all canonical k-mers in the unitigs == all canonical k-mers inserted.
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<Kmer> {
let mut v: Vec<Kmer> = unitigs
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<CanonicalKmer> {
let mut v: Vec<CanonicalKmer> = unitigs
.iter()
.flat_map(|u| u.iter_canonical_kmers(k))
.collect();
@@ -446,7 +466,7 @@ mod tests {
let k = 5;
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
let mut g = GraphDeBruijn::new(k);
g.push(kmer);
g.push(kmer.canonical(k));
g.compute_degrees();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
assert_eq!(unitigs.len(), 1);
@@ -458,9 +478,9 @@ mod tests {
let k = 5;
let mut g = GraphDeBruijn::new(k);
// Two k-mers that share no (k-1)-overlap
g.push(Kmer::from_ascii(b"AAAAA", k).unwrap());
g.push(Kmer::from_ascii(b"TTTTT", k).unwrap()); // same canonical as AAAAA — dedup
// They collapse to one canonical node
g.push(Kmer::from_ascii(b"AAAAA", k).unwrap().canonical(k));
g.push(Kmer::from_ascii(b"TTTTT", k).unwrap().canonical(k)); // same canonical as AAAAA — dedup
// They collapse to one canonical node
assert_eq!(g.len(), 1);
}
@@ -468,8 +488,8 @@ mod tests {
fn unitig_two_truly_distinct_isolated_nodes() {
let k = 5;
let mut g = GraphDeBruijn::new(k);
g.push(Kmer::from_ascii(b"AAAAC", k).unwrap());
g.push(Kmer::from_ascii(b"GGGGT", k).unwrap());
g.push(Kmer::from_ascii(b"AAAAC", k).unwrap().canonical(k));
g.push(Kmer::from_ascii(b"GGGGT", k).unwrap().canonical(k));
g.compute_degrees();
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
// Each isolated node → one unitig of length k