feat: support exact and approximate evidence in layer construction
Refactored `MphfLayer::build` to accept an `EvidenceKind` parameter, routing to exact (index-based, parallel MPHF, writes `evidence.bin`) or approximate (sequential mmap iterator, writes `fingerprint.bin`) pipelines. Introduced `CanonicalKmerIter` for memory-mapped, chunked k-mer iteration with O(1) resets via `Arc<Mmap>`. Updated layer and map APIs to forward evidence kind, added `push_layer` for count matrices, and adjusted tests and public exports accordingly.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use memmap2::Mmap;
|
||||
use obikseq::{CanonicalKmer, Kmer, Unitig};
|
||||
@@ -439,6 +440,85 @@ fn extract_kmer_raw(bytes: &[u8], j: usize, k: usize) -> u64 {
|
||||
raw << (64 - 2 * k)
|
||||
}
|
||||
|
||||
// ── CanonicalKmerRawIter ──────────────────────────────────────────────────────
|
||||
|
||||
// ── CanonicalKmerIter ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Sequential iterator over [`CanonicalKmer`] from a `unitigs.bin` file.
|
||||
///
|
||||
/// Holds an `Arc<Mmap>` so that `Clone` is O(1): both copies share the same
|
||||
/// memory-mapped pages. Cloning resets the cursor to position 0 — this lets
|
||||
/// ptr_hash's `new_from_par_iter` (which requires a `Clone`-able parallel
|
||||
/// iterator via `par_bridge()`) make multiple passes without ever creating
|
||||
/// a `.idx` file.
|
||||
pub struct CanonicalKmerIter {
|
||||
mmap: Arc<Mmap>,
|
||||
k: usize,
|
||||
chunk_pos: usize, // byte offset of the current chunk header
|
||||
data_pos: usize, // byte offset of the current chunk's sequence bytes
|
||||
n_kmers: usize, // kmers in current chunk
|
||||
kmer_idx: usize, // next kmer index to yield within the current chunk
|
||||
}
|
||||
|
||||
impl CanonicalKmerIter {
|
||||
pub fn new(path: &Path) -> SKResult<Self> {
|
||||
let file = File::open(path).map_err(SKError::Io)?;
|
||||
let mmap = Arc::new(unsafe { Mmap::map(&file).map_err(SKError::Io)? });
|
||||
let k = obikseq::params::k();
|
||||
let mut s = Self { mmap, k, chunk_pos: 0, data_pos: 0, n_kmers: 0, kmer_idx: 0 };
|
||||
s.load_chunk();
|
||||
Ok(s)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn load_chunk(&mut self) {
|
||||
if self.chunk_pos < self.mmap.len() {
|
||||
let seql_minus_k = self.mmap[self.chunk_pos] as usize;
|
||||
self.n_kmers = seql_minus_k + 1;
|
||||
self.data_pos = self.chunk_pos + 1;
|
||||
self.kmer_idx = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for CanonicalKmerIter {
|
||||
fn clone(&self) -> Self {
|
||||
let mut c = Self {
|
||||
mmap: Arc::clone(&self.mmap),
|
||||
k: self.k,
|
||||
chunk_pos: 0,
|
||||
data_pos: 0,
|
||||
n_kmers: 0,
|
||||
kmer_idx: 0,
|
||||
};
|
||||
c.load_chunk();
|
||||
c
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for CanonicalKmerIter {
|
||||
type Item = CanonicalKmer;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<CanonicalKmer> {
|
||||
loop {
|
||||
if self.chunk_pos >= self.mmap.len() {
|
||||
return None;
|
||||
}
|
||||
if self.kmer_idx < self.n_kmers {
|
||||
let raw = extract_kmer_raw(&self.mmap[self.data_pos..], self.kmer_idx, self.k);
|
||||
let canon = canonical_raw(raw, self.k);
|
||||
self.kmer_idx += 1;
|
||||
return Some(CanonicalKmer::from_raw_unchecked(canon));
|
||||
}
|
||||
let seql_minus_k = self.mmap[self.chunk_pos] as usize;
|
||||
let byte_len = (seql_minus_k + self.k + 3) / 4;
|
||||
self.chunk_pos += 1 + byte_len;
|
||||
self.load_chunk();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/unitig_index.rs"]
|
||||
mod tests;
|
||||
|
||||
Reference in New Issue
Block a user