feat: enforce canonical k-mer representation throughout the codebase
Refactor core types to consistently use `CanonicalKmer` (the lexicographically smaller of a k-mer and its reverse complement) as the canonical representation, ensuring deterministic behavior in graph traversal (unitig decomposition), neighbor resolution (`unique_neighbor` with `[CanonicalKmer; 4]` input), and scatter output generation. Introduce `RoutableSuperKmer`, add `.seq_hash()` support, and fix type syntax errors in unitig extraction methods and deduplication tests. Update all k-mer construction to use canonical-aware APIs, including unsafe unchecked constructors for performance-critical paths.
This commit is contained in:
@@ -5,7 +5,7 @@ use std::path::{Path, PathBuf};
|
||||
use tracing::{debug, info};
|
||||
|
||||
use memmap2::MmapMut;
|
||||
use obikseq::kmer::Kmer;
|
||||
use obikseq::kmer::CanonicalKmer;
|
||||
use ph::fmph::GOFunction;
|
||||
|
||||
use sysinfo::System;
|
||||
@@ -509,7 +509,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
debug!("{}: sidecar capacity estimate={capacity}", dir.display());
|
||||
|
||||
// Pass 1: collect all unique canonical kmers.
|
||||
let mut seen: HashSet<Kmer> = HashSet::with_capacity(capacity);
|
||||
let mut seen: HashSet<CanonicalKmer> = HashSet::with_capacity(capacity);
|
||||
let mut pass1_superkmers: u64 = 0;
|
||||
{
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
@@ -520,7 +520,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
}
|
||||
}
|
||||
}
|
||||
let kmers: Vec<Kmer> = seen.into_iter().collect();
|
||||
let kmers: Vec<CanonicalKmer> = seen.into_iter().collect();
|
||||
let n_kmers = kmers.len();
|
||||
debug!(
|
||||
"{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}",
|
||||
|
||||
Reference in New Issue
Block a user