feat: enforce canonical k-mer representation throughout the codebase

Refactor core types to consistently use `CanonicalKmer` (the lexicographically smaller of a k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` with `[CanonicalKmer; 4]` input), and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, and fix type syntax errors in unitig extraction methods and deduplication tests. Update all k-mer construction to use canonical-aware APIs, including unsafe unchecked constructors for performance-critical paths.
This commit is contained in:
Eric Coissac
2026-05-01 13:34:55 +02:00
parent 21ddbf1674
commit defeeb9460
12 changed files with 235 additions and 113 deletions
+3 -3
View File
@@ -5,7 +5,7 @@ use std::path::{Path, PathBuf};
use tracing::{debug, info};
use memmap2::MmapMut;
use obikseq::kmer::Kmer;
use obikseq::kmer::CanonicalKmer;
use ph::fmph::GOFunction;
use sysinfo::System;
@@ -509,7 +509,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
debug!("{}: sidecar capacity estimate={capacity}", dir.display());
// Pass 1: collect all unique canonical kmers.
let mut seen: HashSet<Kmer> = HashSet::with_capacity(capacity);
let mut seen: HashSet<CanonicalKmer> = HashSet::with_capacity(capacity);
let mut pass1_superkmers: u64 = 0;
{
let mut reader = SKFileReader::open(dedup_path, k)?;
@@ -520,7 +520,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
}
}
}
let kmers: Vec<Kmer> = seen.into_iter().collect();
let kmers: Vec<CanonicalKmer> = seen.into_iter().collect();
let n_kmers = kmers.len();
debug!(
"{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}",