feat: enforce canonical k-mer representation throughout the codebase

Refactor core types to consistently use `CanonicalKmer` (the lexicographically smaller of a k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` with `[CanonicalKmer; 4]` input), and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, and fix type syntax errors in the unitig extraction methods and deduplication tests. Update all k-mer construction to use canonical-aware APIs, including unsafe unchecked constructors for performance-critical paths.
This commit is contained in:
Eric Coissac
2026-05-01 13:34:55 +02:00
parent 21ddbf1674
commit defeeb9460
12 changed files with 235 additions and 113 deletions
+12 -11
View File
@@ -34,7 +34,7 @@ mod fasta;
use std::io::{self, Write};
use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig};
use obikseq::{kmer::CanonicalKmer, superkmer::SuperKmer, unitig::Unitig};
use xxhash_rust::xxh64::xxh64;
// ── public API ────────────────────────────────────────────────────────────────
@@ -57,7 +57,7 @@ pub fn write_scatter<W: Write>(
k: usize,
m: usize,
partition: usize,
minimizer: Kmer,
minimizer: CanonicalKmer,
) -> io::Result<()> {
let ascii = sk.to_ascii();
let id = seq_id(&ascii);
@@ -154,6 +154,7 @@ fn seq_id(ascii: &[u8]) -> String {
#[cfg(test)]
mod tests {
use super::*;
use obikseq::kmer::Kmer;
use obikseq::superkmer::SuperKmer;
fn make(seq: &[u8]) -> SuperKmer {
@@ -171,7 +172,7 @@ mod tests {
// Checks that the text produced by `write_scatter` contains a `"minimizer":"`
// field and does not contain a `"count":` field.
#[test]
fn scatter_header_contains_minimizer_field() {
let sk = make(b"ACGTACGTACGT");
// NOTE(review): two consecutive `let out` bindings; the second shadows the
// first. They look like the before/after versions of the same line in this
// diff view (`Kmer` vs `CanonicalKmer` minimizer argument) — confirm.
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Kmer::from_raw(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, CanonicalKmer::from_raw_unchecked(0)));
assert!(out.contains("\"minimizer\":\""));
assert!(!out.contains("\"count\":"));
}
@@ -180,14 +181,14 @@ mod tests {
// Checks that a minimizer built from the raw value 6 with length 3 is written
// out as the string "ACG" in the `"minimizer"` field.
fn scatter_minimizer_decoded_from_hash() {
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
let sk = make(b"ACGTACGTACGT");
// NOTE(review): two consecutive `let out` bindings; the second shadows the
// first. They look like the before/after versions of the same line in this
// diff view — confirm. The second wraps the raw bits of
// `Kmer::from_raw_right(6, 3)` via `CanonicalKmer::from_raw_unchecked`.
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, Kmer::from_raw_right(6, 3)));
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, CanonicalKmer::from_raw_unchecked(Kmer::from_raw_right(6, 3).raw())));
// On failure, the captured output is included in the panic message.
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
}
#[test]
fn scatter_fields_present() {
let sk = make(b"ACGTACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Kmer::from_raw(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, CanonicalKmer::from_raw_unchecked(0)));
assert!(out.contains("\"seq_length\":12"));
assert!(out.contains("\"kmer_size\":4"));
assert!(out.contains("\"minimizer_size\":3"));
@@ -197,7 +198,7 @@ mod tests {
// Checks that the second line of the text produced by `write_scatter` is the
// input sequence itself (`ACGTACGT`).
#[test]
fn scatter_sequence_line_correct() {
let sk = make(b"ACGTACGT");
// NOTE(review): two consecutive `let out` bindings; the second shadows the
// first. They look like the before/after versions of the same line in this
// diff view — confirm.
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
let lines: Vec<&str> = out.lines().collect();
// Index 1 is the line immediately after the first output line.
assert_eq!(lines[1], "ACGTACGT");
}
@@ -240,7 +241,7 @@ mod tests {
let sk1 = make(b"ACGTACGT");
let sk2 = make(b"ACGTACGT");
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -248,7 +249,7 @@ mod tests {
.next()
.unwrap()[1..]
.to_string();
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Kmer::from_raw(0)))
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -264,7 +265,7 @@ mod tests {
let sk1 = make(b"ACGTACGT");
let sk2 = make(b"TTTTTTTT");
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -272,7 +273,7 @@ mod tests {
.next()
.unwrap()[1..]
.to_string();
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, Kmer::from_raw(0)))
let id2 = capture(|w| write_scatter(&sk2, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)))
.lines()
.next()
.unwrap()
@@ -286,7 +287,7 @@ mod tests {
#[test]
fn id_is_16_hex_digits() {
let sk = make(b"ACGTACGT");
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, CanonicalKmer::from_raw_unchecked(0)));
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
assert_eq!(id.len(), 16);
assert!(id.chars().all(|c| c.is_ascii_hexdigit()));