9d46400898
Refactored `MphfLayer::build` to accept an `EvidenceKind` parameter, routing to exact (index-based, parallel MPHF, writes `evidence.bin`) or approximate (sequential mmap iterator, writes `fingerprint.bin`) pipelines. Introduced `CanonicalKmerIter` for memory-mapped, chunked k-mer iteration with O(1) resets via `Arc<Mmap>`. Updated layer and map APIs to forward evidence kind, added `push_layer` for count matrices, and adjusted tests and public exports accordingly.
329 lines
14 KiB
Rust
329 lines
14 KiB
Rust
use std::fs;
|
|
use std::path::Path;
|
|
|
|
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
|
use epserde::prelude::*;
|
|
use obikseq::CanonicalKmer;
|
|
use obiskio::{CanonicalKmerIter, UnitigFileReader, UnitigFileWriter, build_unitig_idx};
|
|
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
|
|
|
use crate::error::{OLMError, OLMResult};
|
|
use crate::evidence::{Evidence, EvidenceWriter};
|
|
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
|
|
use crate::meta::{EvidenceKind, LayerMeta};
|
|
|
|
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
|
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
|
pub(crate) const EVIDENCE_FILE: &str = "evidence.bin";
|
|
pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
|
|
|
|
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
|
|
|
// ── Evidence store ────────────────────────────────────────────────────────────
|
|
|
|
enum LayerEvidence {
|
|
Exact { evidence: Evidence, unitigs: UnitigFileReader },
|
|
Approx { fingerprint: FingerprintVec },
|
|
}
|
|
|
|
// ── MphfLayer ─────────────────────────────────────────────────────────────────
|
|
|
|
/// Autonomous kmer → slot mapping for one layer.
|
|
///
|
|
/// Dispatches queries to exact or approximate evidence transparently based on
|
|
/// the `layer_meta.json` written at build time.
|
|
pub struct MphfLayer {
|
|
mphf: Mphf,
|
|
ev: LayerEvidence,
|
|
n: usize,
|
|
}
|
|
|
|
impl MphfLayer {
|
|
pub fn open(dir: &Path) -> OLMResult<Self> {
|
|
let meta = LayerMeta::load(dir)?;
|
|
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
|
let (ev, n) = match meta.evidence {
|
|
EvidenceKind::Exact => {
|
|
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
|
|
let n = evidence.len();
|
|
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
|
|
(LayerEvidence::Exact { evidence, unitigs }, n)
|
|
}
|
|
EvidenceKind::Approx { .. } => {
|
|
let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
|
|
let n = fingerprint.n();
|
|
(LayerEvidence::Approx { fingerprint }, n)
|
|
}
|
|
};
|
|
Ok(Self { mphf, ev, n })
|
|
}
|
|
|
|
// ── Query API ─────────────────────────────────────────────────────────────
|
|
|
|
/// Transparent dispatch: routes to `find_exact` or `find_approx` based on
|
|
/// the evidence loaded at `open` time.
|
|
#[inline]
|
|
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
|
|
match &self.ev {
|
|
LayerEvidence::Exact { .. } => self.find_exact(kmer),
|
|
LayerEvidence::Approx { .. } => self.find_approx(kmer),
|
|
}
|
|
}
|
|
|
|
/// Exact lookup: zero false positives. Panics if the layer was opened with
|
|
/// approximate evidence.
|
|
#[inline]
|
|
pub fn find_exact(&self, kmer: CanonicalKmer) -> Option<usize> {
|
|
let LayerEvidence::Exact { evidence, unitigs } = &self.ev else {
|
|
panic!("find_exact called on an approximate layer");
|
|
};
|
|
let slot = self.mphf.index(&kmer.raw());
|
|
if slot >= self.n { return None; }
|
|
let (chunk_id, rank) = evidence.decode(slot);
|
|
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
|
Some(slot)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Approximate lookup: false-positive rate 1/2^b per k-mer query. Panics
|
|
/// if the layer was opened with exact evidence.
|
|
#[inline]
|
|
pub fn find_approx(&self, kmer: CanonicalKmer) -> Option<usize> {
|
|
let LayerEvidence::Approx { fingerprint } = &self.ev else {
|
|
panic!("find_approx called on an exact layer");
|
|
};
|
|
let slot = self.mphf.index(&kmer.raw());
|
|
if slot >= self.n { return None; }
|
|
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
|
|
}
|
|
|
|
pub fn n(&self) -> usize { self.n }
|
|
|
|
// ── Build helpers ─────────────────────────────────────────────────────────
|
|
|
|
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
|
fs::create_dir_all(dir)?;
|
|
Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?)
|
|
}
|
|
|
|
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
|
/// `kind`. `block_bits` is forwarded to exact evidence only.
|
|
pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
|
|
match kind {
|
|
EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits),
|
|
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
|
|
}
|
|
}
|
|
|
|
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
|
|
///
|
|
/// `block_bits` controls the `.idx` block size (2^block_bits chunks per block).
|
|
/// Uses sequential iteration — no `.idx` required on entry.
|
|
pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
|
let unitig_path = dir.join(UNITIGS_FILE);
|
|
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
|
let n = unitigs.n_kmers();
|
|
|
|
if n == 0 {
|
|
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
|
build_unitig_idx(&unitig_path, block_bits)?;
|
|
LayerMeta::exact().save(dir)?;
|
|
return Ok(0);
|
|
}
|
|
|
|
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
|
|
|
let mut ev = EvidenceWriter::new(n);
|
|
let mut seen = vec![0u8; (n + 7) / 8];
|
|
|
|
for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() {
|
|
let slot = mphf.index(&kmer.raw());
|
|
if slot >= n {
|
|
return Err(OLMError::Mphf("slot out of bounds".into()));
|
|
}
|
|
let byte = slot / 8;
|
|
let bit = 1u8 << (slot % 8);
|
|
if seen[byte] & bit != 0 {
|
|
return Err(OLMError::Mphf("duplicate slot".into()));
|
|
}
|
|
seen[byte] |= bit;
|
|
ev.set(slot, chunk_id as u32, rank as u8);
|
|
}
|
|
|
|
ev.write(&dir.join(EVIDENCE_FILE))?;
|
|
build_unitig_idx(&unitig_path, block_bits)?;
|
|
LayerMeta::exact().save(dir)?;
|
|
Ok(n)
|
|
}
|
|
|
|
/// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`.
|
|
/// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer
|
|
/// parameter (≥1). No `.idx` is written.
|
|
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
|
if b == 0 || b > 64 {
|
|
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
|
|
}
|
|
if z == 0 {
|
|
return Err(OLMError::InvalidLayer("z must be ≥ 1".into()));
|
|
}
|
|
let unitig_path = dir.join(UNITIGS_FILE);
|
|
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
|
let n = unitigs.n_kmers();
|
|
|
|
if n == 0 {
|
|
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
|
|
LayerMeta::approx(b, z).save(dir)?;
|
|
return Ok(0);
|
|
}
|
|
|
|
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
|
|
|
let mut fw = FingerprintVecWriter::new(n, b);
|
|
|
|
for kmer in unitigs.iter_canonical_kmers() {
|
|
let slot = mphf.index(&kmer.raw());
|
|
if slot >= n {
|
|
return Err(OLMError::Mphf("slot out of bounds".into()));
|
|
}
|
|
fw.set(slot, kmer.seq_hash());
|
|
}
|
|
|
|
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
|
LayerMeta::approx(b, z).save(dir)?;
|
|
Ok(n)
|
|
}
|
|
|
|
/// Build MPHF then evidence from the unitigs file already present in `dir`.
|
|
///
|
|
/// - Exact: `.idx` is built for pass-1 parallel construction and kept for
|
|
/// query-time kmer verification. `evidence.bin` is written.
|
|
/// - Approx: pass-1 uses `open_sequential` + `par_bridge` — no `.idx` is
|
|
/// ever created. `fingerprint.bin` is written.
|
|
///
|
|
/// `fill_slot(slot, kmer)` is called once per kmer in both modes.
|
|
pub(crate) fn build(
|
|
dir: &Path,
|
|
block_bits: u8,
|
|
evidence_kind: &EvidenceKind,
|
|
fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
|
|
) -> OLMResult<usize> {
|
|
use rayon::prelude::*;
|
|
|
|
let unitig_path = dir.join(UNITIGS_FILE);
|
|
|
|
match evidence_kind {
|
|
// ── Exact path ────────────────────────────────────────────────────
|
|
EvidenceKind::Exact => {
|
|
build_unitig_idx(&unitig_path, block_bits)?;
|
|
|
|
let unitigs = UnitigFileReader::open(&unitig_path)?;
|
|
let n = unitigs.n_kmers();
|
|
|
|
if n == 0 {
|
|
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
|
let mphf: Mphf =
|
|
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
|
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
|
mphf.store(&dir.join(MPHF_FILE))
|
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
|
LayerMeta::exact().save(dir)?;
|
|
return Ok(0);
|
|
}
|
|
|
|
// Pass 1 — parallel MPHF via random access (.idx required)
|
|
let keys = (0..unitigs.len())
|
|
.into_par_iter()
|
|
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
|
|
let mphf: Mphf =
|
|
Mphf::new_from_par_iter(n, keys, PtrHashParams::<CubicEps>::default());
|
|
mphf.store(&dir.join(MPHF_FILE))
|
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
|
|
|
// Pass 2 — sequential: fill evidence.bin + callback
|
|
let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?;
|
|
let mut ev = EvidenceWriter::new(n);
|
|
let mut seen = vec![0u8; (n + 7) / 8];
|
|
|
|
for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() {
|
|
let slot = mphf.index(&kmer.raw());
|
|
if slot >= n {
|
|
return Err(OLMError::Mphf("slot out of bounds".into()));
|
|
}
|
|
let byte = slot / 8;
|
|
let bit = 1u8 << (slot % 8);
|
|
if seen[byte] & bit != 0 {
|
|
return Err(OLMError::Mphf("duplicate slot".into()));
|
|
}
|
|
seen[byte] |= bit;
|
|
ev.set(slot, chunk_id as u32, rank as u8);
|
|
fill_slot(slot, kmer)?;
|
|
}
|
|
|
|
ev.write(&dir.join(EVIDENCE_FILE))?;
|
|
LayerMeta::exact().save(dir)?;
|
|
Ok(n)
|
|
}
|
|
|
|
// ── Approx path ───────────────────────────────────────────────────
|
|
// No .idx is created at any point.
|
|
EvidenceKind::Approx { b, z } => {
|
|
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
|
let n = unitigs.n_kmers();
|
|
|
|
if n == 0 {
|
|
FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?;
|
|
let mphf: Mphf =
|
|
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
|
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
|
mphf.store(&dir.join(MPHF_FILE))
|
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
|
LayerMeta::approx(*b, *z).save(dir)?;
|
|
return Ok(0);
|
|
}
|
|
|
|
// Pass 1 — MPHF construction via mmap-backed clonable iterator.
|
|
// No .idx is created. par_bridge() parallelises the sequential scan;
|
|
// Clone on CanonicalKmerRawIter shares the Arc<Mmap> and resets to pos 0.
|
|
let keys = CanonicalKmerIter::new(&unitig_path)
|
|
.map_err(|e| match e {
|
|
obiskio::SKError::Io(io) => OLMError::Io(io),
|
|
e => OLMError::InvalidLayer(e.to_string()),
|
|
})?;
|
|
let mphf: Mphf =
|
|
Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::<CubicEps>::default());
|
|
mphf.store(&dir.join(MPHF_FILE))
|
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
|
|
|
// Pass 2 — sequential: fill fingerprint.bin + callback
|
|
let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?;
|
|
let mut fw = FingerprintVecWriter::new(n, *b);
|
|
let mut seen = vec![0u8; (n + 7) / 8];
|
|
|
|
for kmer in unitigs2.iter_canonical_kmers() {
|
|
let slot = mphf.index(&kmer.raw());
|
|
if slot >= n {
|
|
return Err(OLMError::Mphf("slot out of bounds".into()));
|
|
}
|
|
let byte = slot / 8;
|
|
let bit = 1u8 << (slot % 8);
|
|
if seen[byte] & bit != 0 {
|
|
return Err(OLMError::Mphf("duplicate slot".into()));
|
|
}
|
|
seen[byte] |= bit;
|
|
fw.set(slot, kmer.seq_hash());
|
|
fill_slot(slot, kmer)?;
|
|
}
|
|
|
|
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
|
LayerMeta::approx(*b, *z).save(dir)?;
|
|
Ok(n)
|
|
}
|
|
}
|
|
}
|
|
}
|