feat: support exact and approximate evidence in layer construction

Refactored `MphfLayer::build` to accept an `EvidenceKind` parameter, routing to exact (index-based, parallel MPHF, writes `evidence.bin`) or approximate (sequential mmap iterator, writes `fingerprint.bin`) pipelines. Introduced `CanonicalKmerIter` for memory-mapped, chunked k-mer iteration with O(1) resets via `Arc<Mmap>`. Updated layer and map APIs to forward evidence kind, added `push_layer` for count matrices, and adjusted tests and public exports accordingly.
This commit is contained in:
Eric Coissac
2026-05-26 09:41:13 +02:00
parent 036d044291
commit 9d46400898
9 changed files with 215 additions and 68 deletions
+2 -1
View File
@@ -8,7 +8,8 @@ pub use error::{SKError, SKResult};
pub use meta::SKFileMeta;
pub use pool::{SKFilePool, SKFileWriter, SharedPool, create_token, create_token_with};
pub use reader::{SKFileIter, SKFileReader};
pub use unitig_index::{UnitigFileReader, UnitigFileWriter, build_unitig_idx, DEFAULT_BLOCK_BITS};
pub use unitig_index::{UnitigFileReader, UnitigFileWriter, build_unitig_idx, DEFAULT_BLOCK_BITS,
CanonicalKmerIter};
use std::path::{Path, PathBuf};
+80
View File
@@ -1,6 +1,7 @@
use std::fs::File;
use std::io::{BufWriter, Write as _};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use memmap2::Mmap;
use obikseq::{CanonicalKmer, Kmer, Unitig};
@@ -439,6 +440,85 @@ fn extract_kmer_raw(bytes: &[u8], j: usize, k: usize) -> u64 {
raw << (64 - 2 * k)
}
// ── CanonicalKmerIter ─────────────────────────────────────────────────────────
/// Sequential iterator over [`CanonicalKmer`] from a `unitigs.bin` file.
///
/// The file is walked chunk by chunk. As read by this iterator, a chunk is one
/// header byte `h` followed by `(h + k + 3) / 4` bytes of sequence data, and
/// it yields `h + 1` k-mers. (Presumably `h` is `seq_len - k` and the data is
/// packed at four nucleotides per byte — confirm against the writer.)
///
/// Holds an `Arc<Mmap>` so that `Clone` is O(1): both copies share the same
/// memory-mapped pages. Cloning resets the cursor to position 0 — this lets
/// ptr_hash's `new_from_par_iter` (which requires a `Clone`-able parallel
/// iterator via `par_bridge()`) make multiple passes without ever creating
/// a `.idx` file.
pub struct CanonicalKmerIter {
/// Memory mapping of the whole file, shared between clones.
mmap: Arc<Mmap>,
/// K-mer length, read from `obikseq::params::k()` at construction time.
k: usize,
/// Byte offset of the current chunk header.
chunk_pos: usize,
/// Byte offset of the current chunk's sequence bytes (`chunk_pos + 1`).
data_pos: usize,
/// Number of k-mers in the current chunk (header byte + 1).
n_kmers: usize,
/// Next k-mer index to yield within the current chunk.
kmer_idx: usize,
}
impl CanonicalKmerIter {
    /// Opens `path`, memory-maps it and positions the cursor on the first chunk.
    ///
    /// The k-mer length is taken from `obikseq::params::k()` at call time.
    ///
    /// # Errors
    ///
    /// Returns [`SKError::Io`] if the file cannot be opened or mapped.
    pub fn new(path: &Path) -> SKResult<Self> {
        let file = File::open(path).map_err(SKError::Io)?;
        // SAFETY: the mapping is only ever read. As with any file-backed map,
        // soundness relies on the underlying file not being truncated or
        // rewritten by another process while the mapping is alive.
        let mapped = unsafe { Mmap::map(&file) }.map_err(SKError::Io)?;
        let mut iter = Self {
            mmap: Arc::new(mapped),
            k: obikseq::params::k(),
            chunk_pos: 0,
            data_pos: 0,
            n_kmers: 0,
            kmer_idx: 0,
        };
        iter.load_chunk();
        Ok(iter)
    }

    /// Decodes the chunk header located at `chunk_pos`.
    ///
    /// Sets `n_kmers` to the header byte plus one, points `data_pos` at the
    /// byte right after the header and rewinds `kmer_idx` to 0. When
    /// `chunk_pos` is at or past the end of the mapping, nothing is modified.
    #[inline]
    fn load_chunk(&mut self) {
        // A checked read doubles as the end-of-file test.
        if let Some(header) = self.mmap.get(self.chunk_pos).copied() {
            self.n_kmers = header as usize + 1;
            self.data_pos = self.chunk_pos + 1;
            self.kmer_idx = 0;
        }
    }
}
impl Clone for CanonicalKmerIter {
    /// Returns an iterator over the same mapping, rewound to the first chunk.
    ///
    /// Only the `Arc` handle is cloned, so no file data is copied. The
    /// cursor of `self` is deliberately *not* carried over: the copy always
    /// starts again from byte offset 0.
    fn clone(&self) -> Self {
        let mut rewound = Self {
            mmap: Arc::clone(&self.mmap),
            k: self.k,
            chunk_pos: 0,
            data_pos: 0,
            n_kmers: 0,
            kmer_idx: 0,
        };
        // Decode the first chunk header so the copy is immediately usable.
        rewound.load_chunk();
        rewound
    }
}
impl Iterator for CanonicalKmerIter {
    type Item = CanonicalKmer;

    /// Yields the next canonical k-mer, moving on to the following chunk
    /// whenever the current one is exhausted.
    ///
    /// Returns `None` once `chunk_pos` has reached the end of the mapping,
    /// and keeps returning `None` on later calls.
    ///
    /// NOTE(review): nothing here checks that a chunk's sequence bytes lie
    /// fully inside the mapping; a truncated file would presumably panic on
    /// an out-of-bounds index — confirm against `extract_kmer_raw`.
    #[inline]
    fn next(&mut self) -> Option<CanonicalKmer> {
        while self.chunk_pos < self.mmap.len() {
            if self.kmer_idx >= self.n_kmers {
                // Current chunk is used up: skip its header byte and its
                // packed sequence bytes, then decode the next header.
                let header = self.mmap[self.chunk_pos] as usize;
                let packed_len = (header + self.k + 3) / 4;
                self.chunk_pos += 1 + packed_len;
                self.load_chunk();
                continue;
            }

            let chunk_bytes = &self.mmap[self.data_pos..];
            let raw = extract_kmer_raw(chunk_bytes, self.kmer_idx, self.k);
            let canon = canonical_raw(raw, self.k);
            self.kmer_idx += 1;
            return Some(CanonicalKmer::from_raw_unchecked(canon));
        }
        None
    }
}
#[cfg(test)]
#[path = "tests/unitig_index.rs"]
mod tests;