diff --git a/CLAUDE.md b/CLAUDE.md index de49776..5d78c77 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -5,6 +5,9 @@ Tu es ma base de connaissance et mon bloc-notes intelligent sur le projet **obik **Règle absolue : une question appelle une réponse, pas une action.** Ne modifier aucun fichier à moins d'une demande explicite de modification. En particulier : observer un bug ou une incohérence dans le code montré ne constitue pas un mandat pour le corriger. Le code montré peut refléter une intention en cours — modifier sans mandat risque d'introduire un vrai bug là où tu croyais corriger. +**Règle absolue : ne jamais substituer une dépendance ou une bibliothèque sans validation explicite.** +Si une dépendance demandée pose problème (erreur de compilation, bug, API manquante), exposer le problème et proposer des alternatives — ne jamais switcher silencieusement vers une autre bibliothèque. Le choix des dépendances est une décision d'architecture qui appartient au développeur. + Tu maintiens en **anglais**, dense et sans remplissage, les documents suivants : - `docmd/index.md` — document de discussion de base, enrichi progressivement au fil de nos échanges ; il reflète l'état courant de la réflexion sur le projet - les autres fichiers Markdown dans `docmd/` selon leur thème respectif diff --git a/src/Cargo.lock b/src/Cargo.lock index c41e900..6670645 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -314,6 +314,17 @@ dependencies = [ "pkg-config", ] +[[package]] +name = "cacheline-ef" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af737c6c59cb018ecbe6472cbdf86d39c59d78252febfe311953a991b6e4ed85" +dependencies = [ + "common_traits 0.11.4", + "epserde 0.8.0", + "mem_dbg", +] + [[package]] name = "cast" version = "0.3.0" @@ -437,6 +448,15 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "colored" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" +dependencies = [ + "windows-sys 0.61.2", +] + [[package]] name = "combine" version = "4.6.7" @@ -447,6 +467,17 @@ dependencies = [ "memchr", ] +[[package]] +name = "common_traits" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda9ae1f26adcae83adb2e92f69cf59421f2a277a942f49f8e59f2fcbd7cf062" +dependencies = [ + "anyhow", + "half", + "impl-tools 0.10.3", +] + [[package]] name = "common_traits" version = "0.12.1" @@ -455,7 +486,7 @@ checksum = "65d0a1296e8d359cb197a8f8289f3d3f77cdb67f1a83d0aeb0820a5b7aea4058" dependencies = [ "anyhow", "half", - "impl-tools", + "impl-tools 0.11.4", ] [[package]] @@ -751,6 +782,24 @@ dependencies = [ "log", ] +[[package]] +name = "epserde" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c40d342ff20a2ce62d9a85ce406e672dfa137f902ac9670034533184f1533976" +dependencies = [ + "anyhow", + "bitflags 2.11.1", + "common_traits 0.11.4", + "epserde-derive 0.8.0", + "maligned", + "mem_dbg", + "mmap-rs", + "sealed", + "thiserror 2.0.18", + "xxhash-rust", +] + [[package]] name = "epserde" version = "0.11.5" @@ -759,8 +808,8 @@ checksum = "d8dffc01a379703ad5178f47a22aa532f5811b3ef45979ccd66b79da9856770b" dependencies = [ "anyhow", "bitflags 2.11.1", - "common_traits", - "epserde-derive", + "common_traits 0.12.1", + "epserde-derive 0.11.0", "mem_dbg", "mmap-rs", "sealed", @@ -768,6 +817,17 @@ dependencies = [ "xxhash-rust", ] +[[package]] +name = "epserde-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac80cc78b69765703f48ad93f33b8919cf5d907cda7459ad6ba2919cbbe605dd" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "epserde-derive" version = "0.11.0" @@ -903,6 +963,15 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1101,6 +1170,18 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "impl-tools" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae95c9095c2f1126d7db785955c73cdc5fc33e7c3fa911bd4a42931672029a7" +dependencies = [ + "autocfg", + "impl-tools-lib", + "proc-macro-error2", + "syn 2.0.117", +] + [[package]] name = "impl-tools" version = "0.11.4" @@ -1364,6 +1445,12 @@ dependencies = [ "libc", ] +[[package]] +name = "maligned" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e88c3cbe8288f77f293e48a28b3232e3defd203a6d839fa7f68ea4329e83464" + [[package]] name = "matchers" version = "0.2.0" @@ -1380,6 +1467,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "728cc9dc97593cd22f7bc81fbef70a2d391d7a9a855e7d658b653318124a6cf0" dependencies = [ "bitflags 2.11.1", + "maligned", "mem_dbg-derive", "mmap-rs", ] @@ -1653,10 +1741,12 @@ dependencies = [ name = "obilayeredmap" version = "0.1.0" dependencies = [ + "epserde 0.8.0", "memmap2", "obikseq", "obiskio", - "ph", + "ptr_hash", + "rayon", "serde", "serde_json", "tempfile", @@ -1807,7 +1897,7 @@ dependencies = [ "binout", "bitm", "dyn_size_of", - "epserde", + "epserde 0.11.5", "mem_dbg", "rayon", "seedable_hash", @@ -2045,6 +2135,37 @@ dependencies = [ "prost", ] +[[package]] +name = "ptr_hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b4e4fb9c4c2ba3e5b060f53ef46afd3de37345b08e3ec0f2c65e0ca1d57ccbd" +dependencies = [ + "anyhow", + "bitvec", + "cacheline-ef", + "clap", + "colored", + "common_traits 0.11.4", + "epserde 0.8.0", + "epserde-derive 0.8.0", + "fastrand", + "fxhash", + "itertools 0.14.0", + "lazy_static", + "log", + "mem_dbg", + "rand", + "rand_chacha", + "rayon", + "rdst", + "rustc-hash", + "serde", + "sucds", + "tempfile", + "xxhash-rust", +] + [[package]] name = "quote" version = "1.0.45" @@ -2202,6 +2323,12 @@ version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + [[package]] name = "rustix" version = "1.1.4" @@ -2393,6 +2520,16 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "sucds" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd324eaa05be64f105ea5269bb8aabd70e5dd57fa5c673b167f451b07d6c0dcd" +dependencies = [ + "anyhow", + "num-traits", +] + [[package]] name = "sux" version = "0.10.3" @@ -2404,16 +2541,16 @@ dependencies = [ "arbitrary-chunks", "bitflags 2.11.1", "clap", - "common_traits", + "common_traits 0.12.1", "crossbeam-channel", "derivative", "derive_setters", "dsi-progress-logger", "env_logger", - "epserde", + "epserde 0.11.5", "fallible-iterator", "flate2", - "impl-tools", + "impl-tools 0.11.4", "itertools 0.14.0", "jiff", "lambert_w", diff --git a/src/obilayeredmap/Cargo.toml b/src/obilayeredmap/Cargo.toml index 2487fc3..137bb92 100644 --- a/src/obilayeredmap/Cargo.toml +++ b/src/obilayeredmap/Cargo.toml @@ -6,7 +6,9 @@ edition = "2024" [dependencies] obikseq = { path = "../obikseq" } obiskio = { path = "../obiskio" } -ph = "0.11" +ptr_hash = "1.1" +epserde = "0.8" +rayon = "1" memmap2 = "0.9" serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index d42f289..507bb26 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -1,51 +1,61 @@ use std::collections::HashMap; use std::fs; -use std::io::BufWriter; use std::path::Path; -use obikseq::{CanonicalKmer, Kmer, Sequence}; +use epserde::prelude::*; +use obikseq::CanonicalKmer; use obiskio::{UnitigFileReader, UnitigFileWriter}; -use ph::fmph; +use ptr_hash::{DefaultPtrHash, PtrHashParams}; use crate::counts::{Counts, CountsWriter}; use crate::error::{OLMError, OLMResult}; use crate::evidence::{Evidence, EvidenceWriter}; -const MPHF_FILE: &str = "mphf.bin"; -const UNITIGS_FILE: &str = "unitigs.bin"; +const MPHF_FILE: &str = "mphf.bin"; +const UNITIGS_FILE: &str = "unitigs.bin"; const EVIDENCE_FILE: &str = "evidence.bin"; -const COUNTS_FILE: &str = "counts.bin"; +const COUNTS_FILE: &str = "counts.bin"; pub struct Layer { - mphf: fmph::Function, + mphf: DefaultPtrHash, evidence: Evidence, - unitigs: UnitigFileReader, - counts: Counts, + unitigs: UnitigFileReader, + counts: Counts, } pub struct Hit { - pub slot: usize, + pub slot: usize, pub count: u32, } impl Layer { pub fn open(path: &Path) -> OLMResult { - let mphf = fmph::Function::read( - &mut fs::File::open(path.join(MPHF_FILE))? - ).map_err(OLMError::Io)?; + let mphf: DefaultPtrHash = DefaultPtrHash::load_full(&path.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?; + let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?; let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?; - let counts = Counts::open(&path.join(COUNTS_FILE))?; + let counts = Counts::open(&path.join(COUNTS_FILE))?; - Ok(Self { mphf, evidence, unitigs, counts }) + Ok(Self { + mphf, + evidence, + unitigs, + counts, + }) } pub fn query(&self, kmer: CanonicalKmer) -> Option { - let slot = self.mphf.get(&kmer.raw())? as usize; + let slot = self.mphf.index(&kmer.raw()); let (chunk_id, rank) = self.evidence.decode(slot); - if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { - Some(Hit { slot, count: self.counts.get(slot) }) + if self + .unitigs + .verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) + { + Some(Hit { + slot, + count: self.counts.get(slot), + }) } else { None } @@ -55,43 +65,38 @@ impl Layer { /// /// `count_of` maps each canonical kmer to its occurrence count. /// Returns the number of kmers indexed. - pub fn build( - out_dir: &Path, - count_of: impl Fn(CanonicalKmer) -> u32, - ) -> OLMResult { - let k = obikseq::params::k(); + pub fn build(out_dir: &Path, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { + use rayon::prelude::*; + let unitigs = UnitigFileReader::open(&out_dir.join(UNITIGS_FILE))?; + let n = unitigs.n_kmers(); - let mut entries: Vec<(u64, u32, u8)> = Vec::new(); - for chunk_id in 0..unitigs.len() { - let n_kmers = unitigs.seql(chunk_id) - k + 1; - for rank in 0..n_kmers { - let raw = unitigs.raw_kmer(chunk_id, rank); - let canonical: CanonicalKmer = Kmer::from_raw(raw).canonical(); - entries.push((canonical.raw(), chunk_id as u32, rank as u8)); - } - } - - let n = entries.len(); if n == 0 { fs::File::create(out_dir.join(EVIDENCE_FILE))?; fs::File::create(out_dir.join(COUNTS_FILE))?; - let mphf = fmph::Function::new(Vec::::new()); - mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?; + let mphf: DefaultPtrHash = DefaultPtrHash::new(&[] as &[u64], PtrHashParams::default()); + mphf.store(&out_dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; return Ok(0); } - let keys: Vec = entries.iter().map(|(k, _, _)| *k).collect(); - let mphf = fmph::Function::new(keys); - mphf.write(&mut BufWriter::new(fs::File::create(out_dir.join(MPHF_FILE))?))?; + // Build MPHF from a cloneable parallel iterator — no Vec allocation. + // flat_map_iter: outer chunks in parallel, inner kmer sliding-window sequential. + let keys = (0..unitigs.len()) + .into_par_iter() + .flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw())); + let mphf: DefaultPtrHash = + DefaultPtrHash::new_from_par_iter(n, keys, PtrHashParams::default()); + mphf.store(&out_dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + // Second pass: fill evidence and counts let mut ev = EvidenceWriter::new(n); let mut cnt = CountsWriter::new(n); - for (key, chunk_id, rank) in &entries { - let slot = mphf.get(key).unwrap() as usize; - ev.set(slot, *chunk_id, *rank); - let kmer = CanonicalKmer::from_raw_unchecked(*key); + for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() { + let slot = mphf.index(&kmer.raw()); + ev.set(slot, chunk_id as u32, rank as u8); cnt.set(slot, count_of(kmer)); } diff --git a/src/obilayeredmap/src/tests/layer.rs b/src/obilayeredmap/src/tests/layer.rs index 516062c..4831adf 100644 --- a/src/obilayeredmap/src/tests/layer.rs +++ b/src/obilayeredmap/src/tests/layer.rs @@ -1,5 +1,5 @@ use super::*; -use obikseq::{set_k, Unitig}; +use obikseq::{set_k, Kmer, Sequence as _, Unitig}; use tempfile::tempdir; fn write_unitigs(dir: &Path, seqs: &[&[u8]]) { diff --git a/src/obiskio/src/tests/unitig_index.rs b/src/obiskio/src/tests/unitig_index.rs index e9a8702..4223249 100644 --- a/src/obiskio/src/tests/unitig_index.rs +++ b/src/obiskio/src/tests/unitig_index.rs @@ -137,6 +137,108 @@ fn verify_second_unitig_second_position() { assert!(r.verify_canonical_kmer(1, 1, query)); } +// ── iter_kmers ──────────────────────────────────────────────────────────────── + +#[test] +fn iter_kmers_empty_file() { + set_k(4); + let dir = tempdir().unwrap(); + let path = dir.path().join("unitigs.bin"); + UnitigFileWriter::create(&path).unwrap().close().unwrap(); + let r = UnitigFileReader::open(&path).unwrap(); + assert_eq!(r.iter_kmers().count(), 0); +} + +#[test] +fn iter_kmers_single_chunk_count_and_order() { + set_k(4); + // "AAAACG": 6 nucl → 3 kmers (k=4) + let (_dir, r) = write_read(&[b"AAAACG"]); + let kmers: Vec = r.iter_kmers().collect(); + assert_eq!(kmers.len(), 3); + for (rank, kmer) in kmers.iter().enumerate() { + assert_eq!(kmer.raw(), r.raw_kmer(0, rank), "kmer mismatch at rank {rank}"); + } +} + +#[test] +fn iter_kmers_two_chunks_order() { + set_k(4); + // "AAAACG" → 3 kmers, "CCCCAG" → 3 kmers + let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]); + let kmers: Vec = r.iter_kmers().collect(); + assert_eq!(kmers.len(), 6); + // Chunk 0 first + for rank in 0..3 { + assert_eq!(kmers[rank].raw(), r.raw_kmer(0, rank)); + } + // Chunk 1 after + for rank in 0..3 { + assert_eq!(kmers[3 + rank].raw(), r.raw_kmer(1, rank)); + } +} + +// ── iter_canonical_kmers ────────────────────────────────────────────────────── + +#[test] +fn iter_canonical_kmers_all_canonical() { + set_k(4); + let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]); + for kmer in r.iter_canonical_kmers() { + // canonical of a canonical kmer is itself + assert_eq!(kmer.raw(), kmer.canonical().raw()); + } +} + +#[test] +fn iter_canonical_kmers_matches_iter_kmers() { + set_k(4); + let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]); + let canonical: Vec = r.iter_canonical_kmers().collect(); + let raw: Vec = r.iter_kmers().collect(); + assert_eq!(canonical.len(), raw.len()); + for (ck, rk) in canonical.iter().zip(raw.iter()) { + assert_eq!(ck.raw(), rk.canonical().raw()); + } +} + +// ── iter_indexed_canonical_kmers ────────────────────────────────────────────── + +#[test] +fn iter_indexed_chunk_id_and_rank_single_chunk() { + set_k(4); + let (_dir, r) = write_read(&[b"AAAACG"]); + let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect(); + assert_eq!(items.len(), 3); + for (rank, (kmer, chunk_id, item_rank)) in items.iter().enumerate() { + assert_eq!(*chunk_id, 0, "chunk_id must be 0"); + assert_eq!(*item_rank, rank, "rank mismatch"); + assert!(r.verify_canonical_kmer(0, rank, *kmer)); + } +} + +#[test] +fn iter_indexed_chunk_id_and_rank_two_chunks() { + set_k(4); + let (_dir, r) = write_read(&[b"AAAACG", b"CCCCAG"]); + let items: Vec<(CanonicalKmer, usize, usize)> = r.iter_indexed_canonical_kmers().collect(); + assert_eq!(items.len(), 6); + // First 3 items: chunk_id=0, rank 0..2 + for rank in 0..3 { + let (kmer, chunk_id, item_rank) = items[rank]; + assert_eq!(chunk_id, 0); + assert_eq!(item_rank, rank); + assert!(r.verify_canonical_kmer(0, rank, kmer)); + } + // Next 3 items: chunk_id=1, rank resets to 0 + for rank in 0..3 { + let (kmer, chunk_id, item_rank) = items[3 + rank]; + assert_eq!(chunk_id, 1); + assert_eq!(item_rank, rank); + assert!(r.verify_canonical_kmer(1, rank, kmer)); + } +} + // ── Splitting ───────────────────────────────────────────────────────────────── #[test] diff --git a/src/obiskio/src/unitig_index.rs b/src/obiskio/src/unitig_index.rs index 5f47c5e..479740b 100644 --- a/src/obiskio/src/unitig_index.rs +++ b/src/obiskio/src/unitig_index.rs @@ -3,7 +3,7 @@ use std::io::{BufWriter, Write as _}; use std::path::{Path, PathBuf}; use memmap2::Mmap; -use obikseq::{CanonicalKmer, Unitig}; +use obikseq::{CanonicalKmer, Kmer, Unitig}; pub use obikseq::MAX_KMERS_PER_CHUNK; @@ -13,6 +13,7 @@ use crate::error::{SKError, SKResult}; // // magic: [u8; 4] = b"UIDX" // n_unitigs: u32 LE +// n_kmers: u64 LE total kmer count across all chunks // seqls: [u8; n_unitigs] max kmer index per chunk (= n_kmers − 1) // packed_offsets: [u32; n_unitigs + 1] byte offsets to packed bytes in the // sequence file; last entry is sentinel @@ -44,6 +45,7 @@ pub struct UnitigFileWriter { seqls: Vec, packed_offsets: Vec, next_offset: u32, + n_kmers: usize, k: usize, } @@ -56,6 +58,7 @@ impl UnitigFileWriter { seqls: Vec::new(), packed_offsets: Vec::new(), next_offset: 0, + n_kmers: 0, k: obikseq::params::k(), }) } @@ -98,6 +101,7 @@ impl UnitigFileWriter { debug_assert!(seql - self.k <= u8::MAX as usize, "chunk exceeds MAX_KMERS_PER_CHUNK"); self.packed_offsets.push(self.next_offset + 1); self.seqls.push((seql - self.k) as u8); + self.n_kmers += seql - self.k + 1; unitig .write_to_binary(&mut self.file) @@ -122,7 +126,7 @@ impl UnitigFileWriter { }; self.packed_offsets.push(sentinel); - write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets) + write_idx(&idx_path(&self.path), &self.seqls, &self.packed_offsets, self.n_kmers) } pub fn len(&self) -> usize { @@ -134,10 +138,11 @@ impl UnitigFileWriter { } } -fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32]) -> SKResult<()> { +fn write_idx(path: &Path, seqls: &[u8], packed_offsets: &[u32], n_kmers: usize) -> SKResult<()> { let mut w = BufWriter::new(File::create(path).map_err(SKError::Io)?); w.write_all(&MAGIC).map_err(SKError::Io)?; w.write_all(&(seqls.len() as u32).to_le_bytes()).map_err(SKError::Io)?; + w.write_all(&(n_kmers as u64).to_le_bytes()).map_err(SKError::Io)?; w.write_all(seqls).map_err(SKError::Io)?; for &off in packed_offsets { w.write_all(&off.to_le_bytes()).map_err(SKError::Io)?; @@ -155,6 +160,7 @@ pub struct UnitigFileReader { mmap: Mmap, seqls: Vec, packed_offsets: Vec, + n_kmers: usize, k: usize, } @@ -162,9 +168,9 @@ impl UnitigFileReader { pub fn open(path: &Path) -> SKResult { let file = File::open(path).map_err(SKError::Io)?; let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; - let (seqls, packed_offsets) = read_idx(&idx_path(path))?; + let (seqls, packed_offsets, n_kmers) = read_idx(&idx_path(path))?; let k = obikseq::params::k(); - Ok(Self { mmap, seqls, packed_offsets, k }) + Ok(Self { mmap, seqls, packed_offsets, n_kmers, k }) } pub fn len(&self) -> usize { @@ -175,6 +181,11 @@ impl UnitigFileReader { self.seqls.is_empty() } + /// Total number of kmers across all chunks. + pub fn n_kmers(&self) -> usize { + self.n_kmers + } + /// Return the nucleotide length of chunk `i`. #[inline] pub fn seql(&self, i: usize) -> usize { @@ -206,9 +217,42 @@ impl UnitigFileReader { pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool { canonical_raw(self.raw_kmer(i, j), self.k) == query.raw() } + + /// Iterate over all kmers in file order (all positions of chunk 0, then chunk 1, …). + /// + /// Each chunk is copied from the mmap once; iteration within the chunk is + /// zero-allocation (sliding-window via [`OwnedPackedSeqKmerIter`]). + pub fn iter_kmers(&self) -> impl Iterator + '_ { + (0..self.len()).flat_map(move |i| self.unitig(i).into_kmers()) + } + + /// Iterate over all canonical kmers in file order. + /// + /// Equivalent to `iter_kmers().map(|km| km.canonical())` but uses the + /// built-in canonical iterator on each chunk, which avoids a separate + /// canonicalization pass. + pub fn iter_canonical_kmers(&self) -> impl Iterator + '_ { + (0..self.len()).flat_map(move |i| self.unitig(i).into_canonical_kmers()) + } + + /// Iterate over `(kmer, chunk_id, rank)` for every canonical kmer in the file. + /// + /// `chunk_id` is the index of the chunk within this file; `rank` is the + /// 0-based position of the kmer within that chunk. Used to build the + /// evidence table in `obilayeredmap`. + pub fn iter_indexed_canonical_kmers( + &self, + ) -> impl Iterator + '_ { + (0..self.len()).flat_map(move |chunk_id| { + self.unitig(chunk_id) + .into_canonical_kmers() + .enumerate() + .map(move |(rank, kmer)| (kmer, chunk_id, rank)) + }) + } } -fn read_idx(path: &Path) -> SKResult<(Vec, Vec)> { +fn read_idx(path: &Path) -> SKResult<(Vec, Vec, usize)> { let data = std::fs::read(path).map_err(SKError::Io)?; let mut pos = 0; @@ -227,6 +271,11 @@ fn read_idx(path: &Path) -> SKResult<(Vec, Vec)> { let n = u32::from_le_bytes(n_bytes.try_into().unwrap()) as usize; pos += 4; + let nk_bytes = data.get(pos..pos + 8) + .ok_or(SKError::Truncated { context: "unitig index: n_kmers" })?; + let n_kmers = u64::from_le_bytes(nk_bytes.try_into().unwrap()) as usize; + pos += 8; + let seqls = data.get(pos..pos + n) .ok_or(SKError::Truncated { context: "unitig index: seqls" })? .to_vec(); @@ -240,7 +289,7 @@ fn read_idx(path: &Path) -> SKResult<(Vec, Vec)> { pos += 4; } - Ok((seqls, packed_offsets)) + Ok((seqls, packed_offsets, n_kmers)) } // ── Kmer utilities ────────────────────────────────────────────────────────────