feat: enforce canonical k-mer representation throughout the codebase
Refactor core types to consistently use `CanonicalKMer` (lexicographically minimal of k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` with `[CanonicalKmer; 4]` input) and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, fix type syntax errors in unitig extraction methods and deduplication tests. Update all k-mer construction to use canonical-aware APIs, including unsafe unchecked constructors for performance-critical paths.
This commit is contained in:
+60
-40
@@ -1,7 +1,7 @@
|
||||
use ahash::RandomState;
|
||||
use hashbrown::HashMap;
|
||||
use obifastwrite::write_unitig;
|
||||
use obikseq::kmer::Kmer;
|
||||
use obikseq::kmer::{CanonicalKmer, Kmer};
|
||||
use obikseq::unitig::Unitig;
|
||||
use std::cell::Cell;
|
||||
use std::io;
|
||||
@@ -76,7 +76,7 @@ impl Node {
|
||||
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct GraphDeBruijn {
|
||||
nodes: FastHashMap<Kmer, Cell<Node>>,
|
||||
nodes: FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
k: usize,
|
||||
}
|
||||
|
||||
@@ -95,10 +95,10 @@ impl GraphDeBruijn {
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert `kmer` (canonicalised) into the graph. No-op if already present.
|
||||
pub fn push(&mut self, kmer: Kmer) {
|
||||
/// Insert a canonical kmer into the graph. No-op if already present.
|
||||
pub fn push(&mut self, kmer: CanonicalKmer) {
|
||||
self.nodes
|
||||
.entry(kmer.canonical(self.k))
|
||||
.entry(kmer)
|
||||
.or_insert_with(|| Cell::new(Node::default()));
|
||||
}
|
||||
|
||||
@@ -108,8 +108,8 @@ impl GraphDeBruijn {
|
||||
/// Single pass thanks to Cell interior mutability.
|
||||
pub fn compute_degrees(&self) {
|
||||
for (&kmer, cell) in &self.nodes {
|
||||
let right_nuc = unique_neighbor(&kmer.right_canonical_neighbors(self.k), &self.nodes);
|
||||
let left_nuc = unique_neighbor(&kmer.left_canonical_neighbors(self.k), &self.nodes);
|
||||
let right_nuc = unique_neighbor(kmer.right_canonical_neighbors(self.k), &self.nodes);
|
||||
let left_nuc = unique_neighbor(kmer.left_canonical_neighbors(self.k), &self.nodes);
|
||||
|
||||
let mut node = cell.get();
|
||||
node.set_right(right_nuc);
|
||||
@@ -136,8 +136,8 @@ impl GraphDeBruijn {
|
||||
if node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
let start = if !node.can_extend_left() {
|
||||
kmer
|
||||
let start: Kmer = if !node.can_extend_left() {
|
||||
kmer.into_kmer()
|
||||
} else if !node.can_extend_right() {
|
||||
kmer.revcomp(k)
|
||||
} else {
|
||||
@@ -159,7 +159,7 @@ impl GraphDeBruijn {
|
||||
let mut updated = node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(kmer)
|
||||
Some(kmer.into_kmer())
|
||||
});
|
||||
|
||||
chain_starts.chain(cycle_starts)
|
||||
@@ -174,16 +174,21 @@ impl GraphDeBruijn {
|
||||
///
|
||||
/// The returned kmer is oriented so that its first k-1 bases match
|
||||
/// the last k-1 bases of `kmer` (proper De Bruijn overlap).
|
||||
pub fn next_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
pub fn next_unitig_kmer(&self, kmer: Kmer, is_at_start: bool) -> Option<Kmer> {
|
||||
let canonical = kmer.canonical(self.k);
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
let next = if kmer == canonical {
|
||||
if !is_at_start && (!node.can_extend_left() || !node.can_extend_right()) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let next: CanonicalKmer = if kmer.raw() == canonical.raw() {
|
||||
if !node.can_extend_right() {
|
||||
return None;
|
||||
}
|
||||
// push_right gives the raw right extension (non-canonical) that properly extends kmer
|
||||
// push_right gives the raw right extension that properly extends kmer
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
} else {
|
||||
@@ -192,6 +197,7 @@ impl GraphDeBruijn {
|
||||
}
|
||||
// push_left gives the left extension of canonical; its revcomp is the right extension of kmer
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_left(node.left_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
};
|
||||
@@ -204,8 +210,8 @@ impl GraphDeBruijn {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = if kmer.is_overlapping(next, self.k) {
|
||||
next
|
||||
let oriented = if kmer.is_overlapping(next.into_kmer(), self.k) {
|
||||
next.into_kmer()
|
||||
} else {
|
||||
next.revcomp(self.k)
|
||||
};
|
||||
@@ -227,10 +233,7 @@ impl GraphDeBruijn {
|
||||
/// - first item → `kmer.nucleotide(0..k)` (k bases, the seed)
|
||||
/// - next items → `kmer.nucleotide(k-1)` (1 new base each)
|
||||
pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||
UnitigIter {
|
||||
graph: self,
|
||||
current: Some(start),
|
||||
}
|
||||
UnitigIter::new(self, start)
|
||||
}
|
||||
|
||||
/// Iterate over all unitigs in the graph.
|
||||
@@ -275,6 +278,17 @@ impl GraphDeBruijn {
|
||||
pub struct UnitigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
at_start: bool,
|
||||
}
|
||||
|
||||
impl<'a> UnitigIter<'a> {
|
||||
fn new(graph: &'a GraphDeBruijn, start: Kmer) -> Self {
|
||||
Self {
|
||||
graph,
|
||||
current: Some(start),
|
||||
at_start: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnitigIter<'a> {
|
||||
@@ -282,8 +296,9 @@ impl<'a> Iterator for UnitigIter<'a> {
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
// next_kmer handles visited marking and cycle detection
|
||||
self.current = self.graph.next_kmer(current);
|
||||
// next_unitig_kmer handles visited marking and cycle detection
|
||||
self.current = self.graph.next_unitig_kmer(current, self.at_start);
|
||||
self.at_start = false;
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
@@ -293,7 +308,10 @@ impl<'a> Iterator for UnitigIter<'a> {
|
||||
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
||||
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
||||
/// zero or ≥2 existing neighbours.
|
||||
fn unique_neighbor(neighbors: &[Kmer; 4], nodes: &FastHashMap<Kmer, Cell<Node>>) -> Option<u8> {
|
||||
fn unique_neighbor(
|
||||
neighbors: [CanonicalKmer; 4],
|
||||
nodes: &FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
) -> Option<u8> {
|
||||
let mut found: Option<u8> = None;
|
||||
for (i, neighbour) in neighbors.iter().enumerate() {
|
||||
if nodes.contains_key(neighbour) {
|
||||
@@ -316,15 +334,14 @@ mod tests {
|
||||
fn graph_from_ascii(seq: &[u8], k: usize) -> GraphDeBruijn {
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
for i in 0..=seq.len().saturating_sub(k) {
|
||||
let kmer = Kmer::from_ascii(&seq[i..i + k], k).unwrap();
|
||||
g.push(kmer);
|
||||
g.push(Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k));
|
||||
}
|
||||
g
|
||||
}
|
||||
|
||||
// Collect all canonical k-mers from an ASCII sequence into a sorted vec.
|
||||
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<Kmer> {
|
||||
let mut v: Vec<Kmer> = (0..=seq.len().saturating_sub(k))
|
||||
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<CanonicalKmer> {
|
||||
let mut v: Vec<CanonicalKmer> = (0..=seq.len().saturating_sub(k))
|
||||
.map(|i| Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k))
|
||||
.collect();
|
||||
v.sort_unstable();
|
||||
@@ -338,10 +355,9 @@ mod tests {
|
||||
fn push_deduplicates_revcomp() {
|
||||
let k = 5;
|
||||
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
|
||||
let rc = kmer.revcomp(k);
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(kmer);
|
||||
g.push(rc);
|
||||
g.push(kmer.canonical(k));
|
||||
g.push(kmer.revcomp(k).canonical(k));
|
||||
assert_eq!(g.len(), 1, "kmer and its revcomp must map to the same node");
|
||||
}
|
||||
|
||||
@@ -352,7 +368,7 @@ mod tests {
|
||||
let kmer = Kmer::from_ascii(b"ACGT", k).unwrap();
|
||||
assert_eq!(kmer, kmer.revcomp(k), "test requires a palindrome");
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(kmer);
|
||||
g.push(kmer.canonical(k));
|
||||
assert_eq!(g.len(), 1);
|
||||
}
|
||||
|
||||
@@ -360,7 +376,7 @@ mod tests {
|
||||
|
||||
// AAAAGGGG with k=5 → 4 distinct k-mers (AAAAG, AAAGG, AAGGG, AGGGG),
|
||||
// clean linear chain, no Watson-Crick palindrome in first k-1 bases.
|
||||
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<Kmer>) {
|
||||
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<CanonicalKmer>) {
|
||||
let seq = b"AAAAGGGG";
|
||||
let g = graph_from_ascii(seq, k);
|
||||
let kmers = canonical_kmers(seq, k);
|
||||
@@ -386,7 +402,11 @@ mod tests {
|
||||
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
|
||||
// seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
|
||||
assert_eq!(unitigs[0].seql(), seq.len(), "unitig spans the full sequence");
|
||||
assert_eq!(
|
||||
unitigs[0].seql(),
|
||||
seq.len(),
|
||||
"unitig spans the full sequence"
|
||||
);
|
||||
assert_eq!(
|
||||
kmers_from_unitigs(&unitigs, k),
|
||||
canonical_kmers(seq, k),
|
||||
@@ -397,8 +417,8 @@ mod tests {
|
||||
// ── unitig reconstruction ─────────────────────────────────────────────────
|
||||
|
||||
// Round-trip: all canonical k-mers in the unitigs == all canonical k-mers inserted.
|
||||
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<Kmer> {
|
||||
let mut v: Vec<Kmer> = unitigs
|
||||
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<CanonicalKmer> {
|
||||
let mut v: Vec<CanonicalKmer> = unitigs
|
||||
.iter()
|
||||
.flat_map(|u| u.iter_canonical_kmers(k))
|
||||
.collect();
|
||||
@@ -446,7 +466,7 @@ mod tests {
|
||||
let k = 5;
|
||||
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(kmer);
|
||||
g.push(kmer.canonical(k));
|
||||
g.compute_degrees();
|
||||
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||
assert_eq!(unitigs.len(), 1);
|
||||
@@ -458,9 +478,9 @@ mod tests {
|
||||
let k = 5;
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
// Two k-mers that share no (k-1)-overlap
|
||||
g.push(Kmer::from_ascii(b"AAAAA", k).unwrap());
|
||||
g.push(Kmer::from_ascii(b"TTTTT", k).unwrap()); // same canonical as AAAAA — dedup
|
||||
// They collapse to one canonical node
|
||||
g.push(Kmer::from_ascii(b"AAAAA", k).unwrap().canonical(k));
|
||||
g.push(Kmer::from_ascii(b"TTTTT", k).unwrap().canonical(k)); // same canonical as AAAAA — dedup
|
||||
// They collapse to one canonical node
|
||||
assert_eq!(g.len(), 1);
|
||||
}
|
||||
|
||||
@@ -468,8 +488,8 @@ mod tests {
|
||||
fn unitig_two_truly_distinct_isolated_nodes() {
|
||||
let k = 5;
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(Kmer::from_ascii(b"AAAAC", k).unwrap());
|
||||
g.push(Kmer::from_ascii(b"GGGGT", k).unwrap());
|
||||
g.push(Kmer::from_ascii(b"AAAAC", k).unwrap().canonical(k));
|
||||
g.push(Kmer::from_ascii(b"GGGGT", k).unwrap().canonical(k));
|
||||
g.compute_degrees();
|
||||
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||
// Each isolated node → one unitig of length k
|
||||
|
||||
Reference in New Issue
Block a user