feat: enforce canonical k-mer representation throughout the codebase
Refactor core types to consistently use `CanonicalKMer` (lexicographically minimal of k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` with `[CanonicalKmer; 4]` input) and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, fix type syntax errors in unitig extraction methods and deduplication tests. Update all k-mer construction to use canonical-aware APIs, including unsafe unchecked constructors for performance-critical paths.
This commit is contained in:
+60
-40
@@ -1,7 +1,7 @@
|
||||
use ahash::RandomState;
|
||||
use hashbrown::HashMap;
|
||||
use obifastwrite::write_unitig;
|
||||
use obikseq::kmer::Kmer;
|
||||
use obikseq::kmer::{CanonicalKmer, Kmer};
|
||||
use obikseq::unitig::Unitig;
|
||||
use std::cell::Cell;
|
||||
use std::io;
|
||||
@@ -76,7 +76,7 @@ impl Node {
|
||||
// ── GraphDeBruijn ─────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct GraphDeBruijn {
|
||||
nodes: FastHashMap<Kmer, Cell<Node>>,
|
||||
nodes: FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
k: usize,
|
||||
}
|
||||
|
||||
@@ -95,10 +95,10 @@ impl GraphDeBruijn {
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert `kmer` (canonicalised) into the graph. No-op if already present.
|
||||
pub fn push(&mut self, kmer: Kmer) {
|
||||
/// Insert a canonical kmer into the graph. No-op if already present.
|
||||
pub fn push(&mut self, kmer: CanonicalKmer) {
|
||||
self.nodes
|
||||
.entry(kmer.canonical(self.k))
|
||||
.entry(kmer)
|
||||
.or_insert_with(|| Cell::new(Node::default()));
|
||||
}
|
||||
|
||||
@@ -108,8 +108,8 @@ impl GraphDeBruijn {
|
||||
/// Single pass thanks to Cell interior mutability.
|
||||
pub fn compute_degrees(&self) {
|
||||
for (&kmer, cell) in &self.nodes {
|
||||
let right_nuc = unique_neighbor(&kmer.right_canonical_neighbors(self.k), &self.nodes);
|
||||
let left_nuc = unique_neighbor(&kmer.left_canonical_neighbors(self.k), &self.nodes);
|
||||
let right_nuc = unique_neighbor(kmer.right_canonical_neighbors(self.k), &self.nodes);
|
||||
let left_nuc = unique_neighbor(kmer.left_canonical_neighbors(self.k), &self.nodes);
|
||||
|
||||
let mut node = cell.get();
|
||||
node.set_right(right_nuc);
|
||||
@@ -136,8 +136,8 @@ impl GraphDeBruijn {
|
||||
if node.is_visited() {
|
||||
return None;
|
||||
}
|
||||
let start = if !node.can_extend_left() {
|
||||
kmer
|
||||
let start: Kmer = if !node.can_extend_left() {
|
||||
kmer.into_kmer()
|
||||
} else if !node.can_extend_right() {
|
||||
kmer.revcomp(k)
|
||||
} else {
|
||||
@@ -159,7 +159,7 @@ impl GraphDeBruijn {
|
||||
let mut updated = node;
|
||||
updated.set_visited();
|
||||
cell.set(updated);
|
||||
Some(kmer)
|
||||
Some(kmer.into_kmer())
|
||||
});
|
||||
|
||||
chain_starts.chain(cycle_starts)
|
||||
@@ -174,16 +174,21 @@ impl GraphDeBruijn {
|
||||
///
|
||||
/// The returned kmer is oriented so that its first k-1 bases match
|
||||
/// the last k-1 bases of `kmer` (proper De Bruijn overlap).
|
||||
pub fn next_kmer(&self, kmer: Kmer) -> Option<Kmer> {
|
||||
pub fn next_unitig_kmer(&self, kmer: Kmer, is_at_start: bool) -> Option<Kmer> {
|
||||
let canonical = kmer.canonical(self.k);
|
||||
let node = self.nodes.get(&canonical)?.get();
|
||||
|
||||
let next = if kmer == canonical {
|
||||
if !is_at_start && (!node.can_extend_left() || !node.can_extend_right()) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let next: CanonicalKmer = if kmer.raw() == canonical.raw() {
|
||||
if !node.can_extend_right() {
|
||||
return None;
|
||||
}
|
||||
// push_right gives the raw right extension (non-canonical) that properly extends kmer
|
||||
// push_right gives the raw right extension that properly extends kmer
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_right(node.right_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
} else {
|
||||
@@ -192,6 +197,7 @@ impl GraphDeBruijn {
|
||||
}
|
||||
// push_left gives the left extension of canonical; its revcomp is the right extension of kmer
|
||||
canonical
|
||||
.into_kmer()
|
||||
.push_left(node.left_nuc(), self.k)
|
||||
.canonical(self.k)
|
||||
};
|
||||
@@ -204,8 +210,8 @@ impl GraphDeBruijn {
|
||||
return None;
|
||||
}
|
||||
|
||||
let oriented = if kmer.is_overlapping(next, self.k) {
|
||||
next
|
||||
let oriented = if kmer.is_overlapping(next.into_kmer(), self.k) {
|
||||
next.into_kmer()
|
||||
} else {
|
||||
next.revcomp(self.k)
|
||||
};
|
||||
@@ -227,10 +233,7 @@ impl GraphDeBruijn {
|
||||
/// - first item → `kmer.nucleotide(0..k)` (k bases, the seed)
|
||||
/// - next items → `kmer.nucleotide(k-1)` (1 new base each)
|
||||
pub fn iter_unitig_kmers(&self, start: Kmer) -> UnitigIter<'_> {
|
||||
UnitigIter {
|
||||
graph: self,
|
||||
current: Some(start),
|
||||
}
|
||||
UnitigIter::new(self, start)
|
||||
}
|
||||
|
||||
/// Iterate over all unitigs in the graph.
|
||||
@@ -275,6 +278,17 @@ impl GraphDeBruijn {
|
||||
pub struct UnitigIter<'a> {
|
||||
graph: &'a GraphDeBruijn,
|
||||
current: Option<Kmer>,
|
||||
at_start: bool,
|
||||
}
|
||||
|
||||
impl<'a> UnitigIter<'a> {
|
||||
fn new(graph: &'a GraphDeBruijn, start: Kmer) -> Self {
|
||||
Self {
|
||||
graph,
|
||||
current: Some(start),
|
||||
at_start: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for UnitigIter<'a> {
|
||||
@@ -282,8 +296,9 @@ impl<'a> Iterator for UnitigIter<'a> {
|
||||
|
||||
fn next(&mut self) -> Option<Kmer> {
|
||||
let current = self.current?;
|
||||
// next_kmer handles visited marking and cycle detection
|
||||
self.current = self.graph.next_kmer(current);
|
||||
// next_unitig_kmer handles visited marking and cycle detection
|
||||
self.current = self.graph.next_unitig_kmer(current, self.at_start);
|
||||
self.at_start = false;
|
||||
Some(current)
|
||||
}
|
||||
}
|
||||
@@ -293,7 +308,10 @@ impl<'a> Iterator for UnitigIter<'a> {
|
||||
/// Returns `Some(i)` if exactly one of the four canonical neighbours exists in
|
||||
/// the graph, where `i` is its index (0=A, 1=C, 2=G, 3=T). Returns `None` for
|
||||
/// zero or ≥2 existing neighbours.
|
||||
fn unique_neighbor(neighbors: &[Kmer; 4], nodes: &FastHashMap<Kmer, Cell<Node>>) -> Option<u8> {
|
||||
fn unique_neighbor(
|
||||
neighbors: [CanonicalKmer; 4],
|
||||
nodes: &FastHashMap<CanonicalKmer, Cell<Node>>,
|
||||
) -> Option<u8> {
|
||||
let mut found: Option<u8> = None;
|
||||
for (i, neighbour) in neighbors.iter().enumerate() {
|
||||
if nodes.contains_key(neighbour) {
|
||||
@@ -316,15 +334,14 @@ mod tests {
|
||||
fn graph_from_ascii(seq: &[u8], k: usize) -> GraphDeBruijn {
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
for i in 0..=seq.len().saturating_sub(k) {
|
||||
let kmer = Kmer::from_ascii(&seq[i..i + k], k).unwrap();
|
||||
g.push(kmer);
|
||||
g.push(Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k));
|
||||
}
|
||||
g
|
||||
}
|
||||
|
||||
// Collect all canonical k-mers from an ASCII sequence into a sorted vec.
|
||||
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<Kmer> {
|
||||
let mut v: Vec<Kmer> = (0..=seq.len().saturating_sub(k))
|
||||
fn canonical_kmers(seq: &[u8], k: usize) -> Vec<CanonicalKmer> {
|
||||
let mut v: Vec<CanonicalKmer> = (0..=seq.len().saturating_sub(k))
|
||||
.map(|i| Kmer::from_ascii(&seq[i..i + k], k).unwrap().canonical(k))
|
||||
.collect();
|
||||
v.sort_unstable();
|
||||
@@ -338,10 +355,9 @@ mod tests {
|
||||
fn push_deduplicates_revcomp() {
|
||||
let k = 5;
|
||||
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
|
||||
let rc = kmer.revcomp(k);
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(kmer);
|
||||
g.push(rc);
|
||||
g.push(kmer.canonical(k));
|
||||
g.push(kmer.revcomp(k).canonical(k));
|
||||
assert_eq!(g.len(), 1, "kmer and its revcomp must map to the same node");
|
||||
}
|
||||
|
||||
@@ -352,7 +368,7 @@ mod tests {
|
||||
let kmer = Kmer::from_ascii(b"ACGT", k).unwrap();
|
||||
assert_eq!(kmer, kmer.revcomp(k), "test requires a palindrome");
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(kmer);
|
||||
g.push(kmer.canonical(k));
|
||||
assert_eq!(g.len(), 1);
|
||||
}
|
||||
|
||||
@@ -360,7 +376,7 @@ mod tests {
|
||||
|
||||
// AAAAGGGG with k=5 → 4 distinct k-mers (AAAAG, AAAGG, AAGGG, AGGGG),
|
||||
// clean linear chain, no Watson-Crick palindrome in first k-1 bases.
|
||||
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<Kmer>) {
|
||||
fn linear_chain_graph(k: usize) -> (GraphDeBruijn, Vec<CanonicalKmer>) {
|
||||
let seq = b"AAAAGGGG";
|
||||
let g = graph_from_ascii(seq, k);
|
||||
let kmers = canonical_kmers(seq, k);
|
||||
@@ -386,7 +402,11 @@ mod tests {
|
||||
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||
assert_eq!(unitigs.len(), 1, "linear chain → exactly one unitig");
|
||||
// seql = k + (n_kmers - 1) = 5 + 3 = 8 = seq.len()
|
||||
assert_eq!(unitigs[0].seql(), seq.len(), "unitig spans the full sequence");
|
||||
assert_eq!(
|
||||
unitigs[0].seql(),
|
||||
seq.len(),
|
||||
"unitig spans the full sequence"
|
||||
);
|
||||
assert_eq!(
|
||||
kmers_from_unitigs(&unitigs, k),
|
||||
canonical_kmers(seq, k),
|
||||
@@ -397,8 +417,8 @@ mod tests {
|
||||
// ── unitig reconstruction ─────────────────────────────────────────────────
|
||||
|
||||
// Round-trip: all canonical k-mers in the unitigs == all canonical k-mers inserted.
|
||||
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<Kmer> {
|
||||
let mut v: Vec<Kmer> = unitigs
|
||||
fn kmers_from_unitigs(unitigs: &[Unitig], k: usize) -> Vec<CanonicalKmer> {
|
||||
let mut v: Vec<CanonicalKmer> = unitigs
|
||||
.iter()
|
||||
.flat_map(|u| u.iter_canonical_kmers(k))
|
||||
.collect();
|
||||
@@ -446,7 +466,7 @@ mod tests {
|
||||
let k = 5;
|
||||
let kmer = Kmer::from_ascii(b"ACGTA", k).unwrap();
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(kmer);
|
||||
g.push(kmer.canonical(k));
|
||||
g.compute_degrees();
|
||||
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||
assert_eq!(unitigs.len(), 1);
|
||||
@@ -458,9 +478,9 @@ mod tests {
|
||||
let k = 5;
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
// Two k-mers that share no (k-1)-overlap
|
||||
g.push(Kmer::from_ascii(b"AAAAA", k).unwrap());
|
||||
g.push(Kmer::from_ascii(b"TTTTT", k).unwrap()); // same canonical as AAAAA — dedup
|
||||
// They collapse to one canonical node
|
||||
g.push(Kmer::from_ascii(b"AAAAA", k).unwrap().canonical(k));
|
||||
g.push(Kmer::from_ascii(b"TTTTT", k).unwrap().canonical(k)); // same canonical as AAAAA — dedup
|
||||
// They collapse to one canonical node
|
||||
assert_eq!(g.len(), 1);
|
||||
}
|
||||
|
||||
@@ -468,8 +488,8 @@ mod tests {
|
||||
fn unitig_two_truly_distinct_isolated_nodes() {
|
||||
let k = 5;
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
g.push(Kmer::from_ascii(b"AAAAC", k).unwrap());
|
||||
g.push(Kmer::from_ascii(b"GGGGT", k).unwrap());
|
||||
g.push(Kmer::from_ascii(b"AAAAC", k).unwrap().canonical(k));
|
||||
g.push(Kmer::from_ascii(b"GGGGT", k).unwrap().canonical(k));
|
||||
g.compute_degrees();
|
||||
let unitigs: Vec<Unitig> = g.iter_unitig().collect();
|
||||
// Each isolated node → one unitig of length k
|
||||
|
||||
+12
-11
@@ -34,7 +34,7 @@ mod fasta;
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig};
|
||||
use obikseq::{kmer::CanonicalKmer, superkmer::SuperKmer, unitig::Unitig};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
// ── public API ────────────────────────────────────────────────────────────────
|
||||
@@ -57,7 +57,7 @@ pub fn write_scatter<W: Write>(
|
||||
k: usize,
|
||||
m: usize,
|
||||
partition: usize,
|
||||
minimizer: Kmer,
|
||||
minimizer: CanonicalKmer,
|
||||
) -> io::Result<()> {
|
||||
let ascii = sk.to_ascii();
|
||||
let id = seq_id(&ascii);
|
||||
@@ -154,6 +154,7 @@ fn seq_id(ascii: &[u8]) -> String {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use obikseq::kmer::Kmer;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
|
||||
fn make(seq: &[u8]) -> SuperKmer {
|
||||
@@ -171,7 +172,7 @@ mod tests {
|
||||
#[test]
|
||||
fn scatter_header_contains_minimizer_field() {
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Kmer::from_raw(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, CanonicalKmer::from_raw_unchecked(0)));
|
||||
assert!(out.contains("\"minimizer\":\""));
|
||||
assert!(!out.contains("\"count\":"));
|
||||
}
|
||||
@@ -180,14 +181,14 @@ mod tests {
|
||||
fn scatter_minimizer_decoded_from_hash() {
|
||||
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, Kmer::from_raw_right(6, 3)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, CanonicalKmer::from_raw_unchecked(Kmer::from_raw_right(6, 3).raw())));
|
||||
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scatter_fields_present() {
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Kmer::from_raw(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, CanonicalKmer::from_raw_unchecked(0)));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
assert!(out.contains("\"minimizer_size\":3"));
|
||||
@@ -197,7 +198,7 @@ mod tests {
|
||||
#[test]
|
||||
fn scatter_sequence_line_correct() {
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "ACGTACGT");
|
||||
}
|
||||
@@ -240,7 +241,7 @@ mod tests {
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"ACGTACGT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -248,7 +249,7 @@ mod tests {
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -264,7 +265,7 @@ mod tests {
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"TTTTTTTT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -272,7 +273,7 @@ mod tests {
|
||||
.next()
|
||||
.unwrap()[1..]
|
||||
.to_string();
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
|
||||
.lines()
|
||||
.next()
|
||||
.unwrap()
|
||||
@@ -286,7 +287,7 @@ mod tests {
|
||||
#[test]
|
||||
fn id_is_16_hex_digits() {
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
|
||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||
assert_eq!(id.len(), 16);
|
||||
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::path::{Path, PathBuf};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use memmap2::MmapMut;
|
||||
use obikseq::kmer::Kmer;
|
||||
use obikseq::kmer::CanonicalKmer;
|
||||
use ph::fmph::GOFunction;
|
||||
|
||||
use sysinfo::System;
|
||||
@@ -509,7 +509,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
debug!("{}: sidecar capacity estimate={capacity}", dir.display());
|
||||
|
||||
// Pass 1: collect all unique canonical kmers.
|
||||
let mut seen: HashSet<Kmer> = HashSet::with_capacity(capacity);
|
||||
let mut seen: HashSet<CanonicalKmer> = HashSet::with_capacity(capacity);
|
||||
let mut pass1_superkmers: u64 = 0;
|
||||
{
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
@@ -520,7 +520,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
let kmers: Vec<Kmer> = seen.into_iter().collect();
|
||||
let kmers: Vec<CanonicalKmer> = seen.into_iter().collect();
|
||||
let n_kmers = kmers.len();
|
||||
debug!(
|
||||
"{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}",
|
||||
|
||||
+103
-45
@@ -151,44 +151,9 @@ impl Kmer {
|
||||
/// Return the canonical form: lexicographic minimum of forward and reverse complement.
|
||||
/// Zero allocation — result lives on the stack.
|
||||
#[inline]
|
||||
pub fn canonical(&self, k: usize) -> Self {
|
||||
pub fn canonical(&self, k: usize) -> CanonicalKmer {
|
||||
let rc = self.revcomp(k);
|
||||
if self.0 <= rc.0 { *self } else { rc }
|
||||
}
|
||||
|
||||
/// Return a hash of this kmer.
|
||||
///
|
||||
/// Uses the canonical form of the kmer to compute the hash.
|
||||
#[inline]
|
||||
pub fn seq_hash(&self, k: usize) -> u64 {
|
||||
mix64(self.canonical(k).0)
|
||||
}
|
||||
|
||||
/// Return the left canonical neighbors of this kmer.
|
||||
///
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn left_canonical_neighbors(&self, k: usize) -> [Kmer; 4] {
|
||||
let shifted = (self.0 >> 2) & (!0u64 << (64 - 2 * k));
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (2u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (3u64 << 62)).canonical(k),
|
||||
]
|
||||
}
|
||||
|
||||
/// Return the right canonical neighbors of this kmer.
|
||||
///
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn right_canonical_neighbors(&self, k: usize) -> [Kmer; 4] {
|
||||
let shifted = self.0 << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
let shift = 64 - 2 * k;
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (2u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (3u64 << shift)).canonical(k),
|
||||
]
|
||||
CanonicalKmer(if self.0 <= rc.0 { *self } else { rc })
|
||||
}
|
||||
|
||||
/// Slide the window one base to the right: drop the first nucleotide, append `nuc` at position k-1.
|
||||
@@ -215,6 +180,99 @@ impl Kmer {
|
||||
}
|
||||
}
|
||||
|
||||
// ── CanonicalKmer ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// A [`Kmer`] guaranteed to be in canonical form (lexicographic minimum of
|
||||
/// forward and reverse complement).
|
||||
///
|
||||
/// The only public constructors are [`Kmer::canonical`] (checked) and
|
||||
/// [`CanonicalKmer::from_raw_unchecked`] (for trusted paths such as
|
||||
/// deserialisation or rolling-window minimizer extraction).
|
||||
#[repr(transparent)]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||
pub struct CanonicalKmer(Kmer);
|
||||
|
||||
impl CanonicalKmer {
|
||||
/// Wrap a raw left-aligned u64 without verifying the canonical invariant.
|
||||
///
|
||||
/// # Safety (logical)
|
||||
/// The caller must guarantee that `raw == min(raw, revcomp(raw, k))`.
|
||||
/// Violations cause silently wrong results in MPHF lookup and graph traversal.
|
||||
#[inline]
|
||||
pub fn from_raw_unchecked(raw: u64) -> Self {
|
||||
CanonicalKmer(Kmer(raw))
|
||||
}
|
||||
|
||||
/// Return the raw left-aligned u64 value.
|
||||
#[inline]
|
||||
pub fn raw(&self) -> u64 {
|
||||
self.0.0
|
||||
}
|
||||
|
||||
/// Decode into a freshly allocated ASCII `Vec<u8>`.
|
||||
#[inline]
|
||||
pub fn to_ascii(&self, k: usize) -> Vec<u8> {
|
||||
self.0.to_ascii(k)
|
||||
}
|
||||
|
||||
/// Decode into ASCII nucleotides, writing into `writer`.
|
||||
#[inline]
|
||||
pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
|
||||
self.0.write_ascii(k, writer)
|
||||
}
|
||||
|
||||
/// Compute the reverse complement. The result is a raw [`Kmer`] — the
|
||||
/// revcomp of a canonical kmer is not necessarily canonical itself.
|
||||
#[inline]
|
||||
pub fn revcomp(&self, k: usize) -> Kmer {
|
||||
self.0.revcomp(k)
|
||||
}
|
||||
|
||||
/// Hash via `mix64`. No re-canonicalisation needed.
|
||||
#[inline]
|
||||
pub fn seq_hash(&self, _k: usize) -> u64 {
|
||||
mix64(self.0.0)
|
||||
}
|
||||
|
||||
/// Return the four left canonical neighbours (each already canonical).
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn left_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
|
||||
let shifted = (self.raw() >> 2) & (!0u64 << (64 - 2 * k));
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (2u64 << 62)).canonical(k),
|
||||
Kmer(shifted | (3u64 << 62)).canonical(k),
|
||||
]
|
||||
}
|
||||
|
||||
/// Return the four right canonical neighbours (each already canonical).
|
||||
/// Zero allocation — result lives on the stack.
|
||||
pub fn right_canonical_neighbors(&self, k: usize) -> [CanonicalKmer; 4] {
|
||||
let shifted = self.raw() << 2 & (!0u64 << (64 - 2 * (k - 1)));
|
||||
let shift = 64 - 2 * k;
|
||||
[
|
||||
Kmer(shifted).canonical(k),
|
||||
Kmer(shifted | (1u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (2u64 << shift)).canonical(k),
|
||||
Kmer(shifted | (3u64 << shift)).canonical(k),
|
||||
]
|
||||
}
|
||||
|
||||
/// Consume this wrapper and return the inner raw [`Kmer`].
|
||||
#[inline]
|
||||
pub fn into_kmer(self) -> Kmer {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<CanonicalKmer> for Kmer {
|
||||
#[inline]
|
||||
fn from(ck: CanonicalKmer) -> Self {
|
||||
ck.0
|
||||
}
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -334,33 +392,33 @@ mod tests {
|
||||
#[test]
|
||||
fn canonical_palindrome() {
|
||||
let kmer = Kmer::from_ascii(b"ACGT", 4).unwrap();
|
||||
assert_eq!(kmer.canonical(4), kmer);
|
||||
assert_eq!(kmer.canonical(4).into_kmer(), kmer);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_lesser() {
|
||||
let kmer = Kmer::from_ascii(b"TTTT", 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(kmer.canonical(4), expected);
|
||||
assert_eq!(kmer.canonical(4).into_kmer(), expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_is_minimal() {
|
||||
for &k in K_VALUES {
|
||||
let ascii = make_seq(k);
|
||||
let kmer = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
|
||||
let rc = kmer.revcomp(k);
|
||||
assert!(kmer.0 <= rc.0, "canonical not minimal for k={k}");
|
||||
let ck = Kmer::from_ascii(&ascii, k).unwrap().canonical(k);
|
||||
let rc = ck.revcomp(k);
|
||||
assert!(ck.raw() <= rc.raw(), "canonical not minimal for k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_idempotent() {
|
||||
for &k in K_VALUES {
|
||||
let kmer = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
|
||||
let ck = Kmer::from_ascii(&make_seq(k), k).unwrap().canonical(k);
|
||||
assert_eq!(
|
||||
kmer.canonical(k),
|
||||
kmer,
|
||||
ck.into_kmer().canonical(k),
|
||||
ck,
|
||||
"canonical not idempotent for k={k}"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -17,5 +17,6 @@ pub mod superkmer;
|
||||
pub mod unitig;
|
||||
|
||||
pub use annotations::Annotation;
|
||||
pub use kmer::CanonicalKmer;
|
||||
pub use routable::RoutableSuperKmer;
|
||||
pub use superkmer::SuperKmer;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
//! Super-kmer with routing metadata: canonical sequence + pre-computed minimizer.
|
||||
|
||||
use super::kmer::Kmer;
|
||||
use super::kmer::CanonicalKmer;
|
||||
use super::SuperKmer;
|
||||
|
||||
/// Owned wrapper that pairs a canonical [`SuperKmer`] with its minimizer [`Kmer`].
|
||||
@@ -13,7 +13,7 @@ use super::SuperKmer;
|
||||
/// [`into_superkmer`]: RoutableSuperKmer::into_superkmer
|
||||
pub struct RoutableSuperKmer {
|
||||
superkmer: SuperKmer,
|
||||
minimizer: Kmer,
|
||||
minimizer: CanonicalKmer,
|
||||
}
|
||||
|
||||
impl RoutableSuperKmer {
|
||||
@@ -43,7 +43,7 @@ impl RoutableSuperKmer {
|
||||
}
|
||||
|
||||
/// Borrow the canonical minimizer kmer.
|
||||
pub fn minimizer(&self) -> &Kmer {
|
||||
pub fn minimizer(&self) -> &CanonicalKmer {
|
||||
&self.minimizer
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::io::{self, Write};
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
use bitvec::prelude::*;
|
||||
use xxhash_rust::xxh3::xxh3_64;
|
||||
@@ -275,7 +275,7 @@ impl SuperKmer {
|
||||
/// Extract the canonical kmer of length k starting at nucleotide position i (0-based).
|
||||
///
|
||||
/// Returns an error if k is invalid (0 or > 32) or if position i + k exceeds the sequence length.
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
}
|
||||
|
||||
@@ -312,7 +312,7 @@ impl SuperKmer {
|
||||
}
|
||||
|
||||
/// Iterate over all canonical kmers of length `k` in order.
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||
}
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ fn canonical_kmer_palindrome_unchanged() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let fwd = sk.kmer(0, 4).unwrap();
|
||||
assert_eq!(ck, fwd);
|
||||
assert_eq!(ck.into_kmer(), fwd);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -99,7 +99,7 @@ fn canonical_kmer_tttt_becomes_aaaa() {
|
||||
let sk = SuperKmer::from_ascii(b"TTTT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(ck, expected);
|
||||
assert_eq!(ck.into_kmer(), expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::kmer::{CanonicalKmer, Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
use bitvec::prelude::*;
|
||||
|
||||
@@ -198,7 +198,7 @@ impl Unitig {
|
||||
}
|
||||
|
||||
/// Extract the canonical kmer of length `k` starting at position `i`.
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<Kmer, KmerError> {
|
||||
pub fn canonical_kmer(&self, i: usize, k: usize) -> Result<CanonicalKmer, KmerError> {
|
||||
Ok(self.kmer(i, k)?.canonical(k))
|
||||
}
|
||||
|
||||
@@ -208,7 +208,7 @@ impl Unitig {
|
||||
}
|
||||
|
||||
/// Iterate over all canonical kmers of length `k` in order.
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = Kmer> + '_ {
|
||||
pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
|
||||
self.iter_kmers(k).map(move |km| km.canonical(k))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
//! | super-kmer length = 256| k |
|
||||
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
use obikseq::kmer::CanonicalKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
use crate::rolling_stat::RollingStat;
|
||||
@@ -29,7 +30,7 @@ pub struct SuperKmerIter<'a> {
|
||||
theta: f64,
|
||||
scratch: SuperKmerScratch,
|
||||
stat: RollingStat,
|
||||
prev_min: Option<u64>,
|
||||
prev_min: Option<CanonicalKmer>,
|
||||
prev_min_pos: usize,
|
||||
}
|
||||
|
||||
@@ -107,7 +108,7 @@ impl Iterator for SuperKmerIter<'_> {
|
||||
continue;
|
||||
}
|
||||
|
||||
let min = self.stat.canonical_minimizer_raw().unwrap_or(0);
|
||||
let min = self.stat.canonical_minimizer().unwrap(); // always Some after ready()
|
||||
let min_pos = self.stat.minimizer_position().unwrap_or(0);
|
||||
|
||||
// ── 2. Minimizer change check ─────────────────────────────────────
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use obikseq::kmer::Kmer;
|
||||
use obikseq::kmer::{CanonicalKmer, Kmer};
|
||||
|
||||
use crate::encoding::encode_nuc;
|
||||
use crate::entropy_table::{WS_MAX, emax, entropy_norm_kmer, ln_class_size, log_nwords, n_log_n};
|
||||
@@ -283,6 +283,12 @@ impl RollingStat {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn canonical_minimizer(&self) -> Option<CanonicalKmer> {
|
||||
self.canonical_minimizer_raw().map(|raw| {
|
||||
CanonicalKmer::from_raw_unchecked(Kmer::from_raw_right(raw, self.m).raw())
|
||||
})
|
||||
}
|
||||
|
||||
pub fn entropy(&self, order: usize) -> Option<f64> {
|
||||
if !self.ready() {
|
||||
return None;
|
||||
|
||||
Reference in New Issue
Block a user