feat: add approximate evidence matching and index estimation CLI

Introduces a new `estimate` CLI subcommand to calculate Bloom filter size, evidence bits, and false-positive rates for approximate indexing. Updates the index building and querying pipelines to support both exact and approximate evidence types via a unified `EvidenceKind` abstraction. Refactors `MphfLayer` and partition index builders to route operations based on the selected evidence mode, and adds the required `obilayeredmap` dependency.
This commit is contained in:
Eric Coissac
2026-05-23 12:43:32 +02:00
parent 16a6b0d033
commit 876bc0127f
11 changed files with 243 additions and 47 deletions
+7
View File
@@ -10,6 +10,7 @@ use obikseq::CanonicalKmer;
use obiskio::{UnitigFileReader, UnitigFileWriter};
use crate::error::{OLMError, OLMResult};
use crate::meta::EvidenceKind;
use crate::mphf_layer::MphfLayer;
pub(crate) use crate::mphf_layer::UNITIGS_FILE;
@@ -93,6 +94,12 @@ impl<D: LayerData> Layer<D> {
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
MphfLayer::build_approx_evidence(layer_dir, b, z)
}
/// Build the evidence files for the layer stored in `layer_dir`.
///
/// Thin wrapper that forwards to [`MphfLayer::build_evidence`], which
/// dispatches to `build_exact_evidence` or `build_approx_evidence` based on
/// `kind`.
///
/// The returned count is passed through unchanged from the selected builder.
///
/// # Errors
///
/// Propagates any error reported by the underlying `MphfLayer` builder.
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
MphfLayer::build_evidence(layer_dir, kind)
}
}
// ── Mode 1 — set membership ───────────────────────────────────────────────────
+3 -2
View File
@@ -17,9 +17,10 @@ pub enum EvidenceKind {
/// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
Exact,
/// Approximate evidence: `fingerprint.bin` only.
/// `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b.
/// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b.
/// `z` — consecutive k-mers that must all match (Findere trick);
/// effective FP rate per read ≈ (W / 2^(b*z)) where W = read windows.
/// effective FP rate per sequencing read ≈ W / 2^(b·z)
/// where W = L - k - z + 2 is the number of windows in a read of length L.
Approx { b: u8, z: u8 },
}
+75 -43
View File
@@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
use crate::error::{OLMError, OLMResult};
use crate::evidence::{Evidence, EvidenceWriter};
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
use crate::meta::LayerMeta;
use crate::meta::{EvidenceKind, LayerMeta};
pub(crate) const MPHF_FILE: &str = "mphf.bin";
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
@@ -19,80 +19,117 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
// ── Evidence store ────────────────────────────────────────────────────────────
/// Evidence store backing the membership checks of one layer.
///
/// The variant is chosen in `MphfLayer::open` from the `EvidenceKind`
/// recorded in the layer metadata, and decides which lookup
/// (`find_exact` or `find_approx`) is valid for the layer.
enum LayerEvidence {
/// Exact mode: `evidence` decodes a slot into `(chunk_id, rank)` and
/// `unitigs` is used to verify the queried k-mer at that position.
Exact { evidence: Evidence, unitigs: UnitigFileReader },
/// Approximate mode: `fingerprint` is matched against the k-mer's
/// `seq_hash()` for the slot; no unitig access is involved.
Approx { fingerprint: FingerprintVec },
}
// ── MphfLayer ─────────────────────────────────────────────────────────────────
/// Autonomous kmer → slot mapping for one layer.
///
/// Answers presence/absence queries without any attached DataStore.
/// Build once, never rebuilt — data stores are attached and derived externally.
/// Dispatches queries to exact or approximate evidence transparently based on
/// the `layer_meta.json` written at build time.
pub struct MphfLayer {
mphf: Mphf,
evidence: Evidence,
unitigs: UnitigFileReader,
n: usize,
mphf: Mphf,
ev: LayerEvidence,
n: usize,
}
impl MphfLayer {
pub fn open(dir: &Path) -> OLMResult<Self> {
let meta = LayerMeta::load(dir)?;
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
let n = evidence.len();
Ok(Self { mphf, evidence, unitigs, n })
let (ev, n) = match meta.evidence {
EvidenceKind::Exact => {
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
let n = evidence.len();
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
(LayerEvidence::Exact { evidence, unitigs }, n)
}
EvidenceKind::Approx { .. } => {
let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
let n = fingerprint.n();
(LayerEvidence::Approx { fingerprint }, n)
}
};
Ok(Self { mphf, ev, n })
}
/// Returns `Some(slot)` if `kmer` belongs to this layer, `None` otherwise.
// ── Query API ─────────────────────────────────────────────────────────────
/// Look up `kmer` in this layer using whichever evidence kind was loaded
/// at `open` time.
///
/// Routes to `find_exact` for an exact layer and to `find_approx` for an
/// approximate one, so the panic paths of those two methods are never
/// reached through this entry point.
///
/// Returns `Some(slot)` when the selected lookup accepts the k-mer and
/// `None` otherwise. In approximate mode a `Some` may be a false positive.
#[inline]
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
// Only the variant tag matters here; each callee re-borrows the payload.
match &self.ev {
LayerEvidence::Exact { .. } => self.find_exact(kmer),
LayerEvidence::Approx { .. } => self.find_approx(kmer),
}
}
/// Exact lookup: zero false positives.
///
/// # Panics
///
/// Panics if the layer was opened with approximate evidence.
#[inline]
pub fn find_exact(&self, kmer: CanonicalKmer) -> Option<usize> {
let LayerEvidence::Exact { evidence, unitigs } = &self.ev else {
panic!("find_exact called on an approximate layer");
};
let slot = self.mphf.index(&kmer.raw());
if slot >= self.n { return None; }
let (chunk_id, rank) = self.evidence.decode(slot);
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
let (chunk_id, rank) = evidence.decode(slot);
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
Some(slot)
} else {
None
}
}
/// Returns `Some(slot)` if `kmer` passes the fingerprint check, `None` otherwise.
///
/// False positive rate per query: 1/2^b (where b is the fingerprint width
/// used at build time). No `.idx` or `evidence.bin` needed at query time.
/// Approximate lookup: false-positive rate 1/2^b per k-mer query.
///
/// # Panics
///
/// Panics if the layer was opened with exact evidence.
#[inline]
pub fn find_approx(&self, kmer: CanonicalKmer, fp: &FingerprintVec) -> Option<usize> {
pub fn find_approx(&self, kmer: CanonicalKmer) -> Option<usize> {
let LayerEvidence::Approx { fingerprint } = &self.ev else {
panic!("find_approx called on an exact layer");
};
let slot = self.mphf.index(&kmer.raw());
if slot >= self.n { return None; }
if fp.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
}
/// Number of entries reported by the evidence store loaded in `open`.
///
/// Lookups reject any MPHF slot greater than or equal to this value.
pub fn n(&self) -> usize { self.n }
// ── Build helpers ─────────────────────────────────────────────────────────
/// Create `dir` (including any missing parent directories) and open a
/// writer on the layer's `unitigs.bin` file inside it.
///
/// # Errors
///
/// Fails if the directory cannot be created or if
/// `UnitigFileWriter::create` reports an error.
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
// Ensure the layer directory exists before the writer touches it.
fs::create_dir_all(dir)?;
Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?)
}
/// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
/// `mphf.bin` already present in `dir`.
/// Build the evidence for the layer in `dir` according to `kind`.
///
/// - `EvidenceKind::Exact` calls [`Self::build_exact_evidence`].
/// - `EvidenceKind::Approx { b, z }` calls [`Self::build_approx_evidence`]
///   with those `b` and `z` values.
///
/// Returns the selected builder's result unchanged, including any error.
pub fn build_evidence(dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
match kind {
EvidenceKind::Exact => Self::build_exact_evidence(dir),
// `b` and `z` are bound by reference through `kind`; copy the `u8`s out.
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
}
}
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
///
/// This is the exact-evidence construction route. It can be called:
/// - after the initial build (via [`Self::build`] which calls it internally)
/// - standalone to promote an existing (unitigs + mphf) into an exact index
/// - standalone to rebuild evidence after a format change
/// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
/// `mphf.bin` already present in `dir`.
///
/// Uses sequential iteration — no `.idx` is required on entry.
/// Writes both `evidence.bin` and `unitigs.bin.idx` on success.
/// Uses sequential iteration — no `.idx` required on entry.
pub fn build_exact_evidence(dir: &Path) -> OLMResult<usize> {
let unitig_path = dir.join(UNITIGS_FILE);
// Sequential scan — no .idx required for iteration
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
let n = unitigs.n_kmers();
if n == 0 {
fs::File::create(dir.join(EVIDENCE_FILE))?;
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
LayerMeta::exact().save(dir)?;
return Ok(0);
}
@@ -117,18 +154,14 @@ impl MphfLayer {
}
ev.write(&dir.join(EVIDENCE_FILE))?;
// Write .idx last: it is only needed for random access (queries).
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
LayerMeta::exact().save(dir)?;
Ok(n)
}
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
/// in `dir`. `b` — fingerprint bits (1..=64); `z` — Findere consecutive
/// k-mer parameter (≥1).
///
/// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
/// No `.idx` file is written — approximate evidence needs no random access.
/// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`.
/// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer
/// parameter (≥1). No `.idx` is written.
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
if b == 0 || b > 64 {
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
@@ -175,8 +208,6 @@ impl MphfLayer {
let unitig_path = dir.join(UNITIGS_FILE);
// Write .idx so that UnitigFileReader::open succeeds and parallel
// random access is available for MPHF construction.
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
let unitigs = UnitigFileReader::open(&unitig_path)?;
@@ -189,10 +220,11 @@ impl MphfLayer {
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
mphf.store(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
LayerMeta::exact().save(dir)?;
return Ok(0);
}
// Pass 1 — build MPHF (parallel, uses random access via .idx)
// Pass 1 — build MPHF (parallel, random access via .idx)
let keys = (0..unitigs.len())
.into_par_iter()
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));