feat: add approximate evidence matching and index estimation CLI

Introduces a new `estimate` CLI subcommand to calculate bloom filter size, evidence bits, and false-positive rates for approximate indexing. Updates the index building and querying pipelines to support both exact and approximate evidence types via a unified `EvidenceKind` abstraction. Refactors `MphfLayer` and partition index builders to route operations based on the selected evidence mode, and adds the required `obilayeredmap` dependency.
2026-05-23 12:43:32 +02:00
parent 16a6b0d033
commit 876bc0127f
11 changed files with 243 additions and 47 deletions
@@ -1529,6 +1529,7 @@ dependencies = [
 "obikpartitionner",
 "obikrope",
 "obikseq",
 "obilayeredmap",
 "obipipeline",
 "obiread",
 "obiskbuilder",
@@ -144,6 +144,7 @@ impl KmerIndex {
        let n = self.partition.n_partitions();
        let t = Stage::start("index");
        let with_counts = self.meta.config.with_counts;
        let evidence = self.meta.config.evidence.clone();
        let total_kmers = AtomicUsize::new(0);
        let pb = Arc::new(Mutex::new(
@@ -153,7 +154,7 @@ impl KmerIndex {
        ));
        (0..n).into_par_iter().for_each(|i| {
-            match self.partition.build_index_layer(i, min_ab, max_ab, with_counts) {
+            match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence) {
                Ok(0) => {}
                Ok(n_kmers) => {
                    total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
@@ -18,6 +18,7 @@ obikpartitionner = { path = "../obikpartitionner" }
 obisys        = { path = "../obisys" }
 obiskio       = { path = "../obiskio" }
 obikindex     = { path = "../obikindex" }
 obilayeredmap = { path = "../obilayeredmap" }
 clap          = { version = "4", features = ["derive"] }
 serde_json    = "1"
 csv           = "1"
@@ -0,0 +1,38 @@
 use clap::Args;
 use super::index::resolve_approx_params;
 #[derive(Args)]
 pub struct EstimateArgs {
    /// k-mer size used for querying (same as --kmer-size in index)
    #[arg(short = 'k', long, default_value_t = 31)]
    pub kmer_size: usize,
    /// Findere z parameter: number of consecutive k-mers that must all match.
    /// Effective indexed k-mer size is kmer_size - z + 1.
    #[arg(short = 'z', long, default_value = None)]
    pub findere_z: Option<u8>,
    /// Fingerprint bits per slot (b).  FP per z-window = 1/2^(b·z).
    #[arg(long, default_value = None)]
    pub evidence_bits: Option<u8>,
    /// Target false-positive rate per z-window (e.g. 0.01).
    #[arg(long, default_value = None)]
    pub fp: Option<f64>,
 }
 pub fn run(args: EstimateArgs) {
    let (z, b, fp_window) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
    let k_query = args.kmer_size;
    let k_index = k_query.saturating_sub(z as usize - 1);
    let fp_kmer = 1.0_f64 / 2_f64.powi(b as i32);
    println!("{:<22} {}", "k (query):",        k_query);
    println!("{:<22} {}", "k (indexed):",      k_index);
    println!("{:<22} {}", "z:",                z);
    println!("{:<22} {}", "evidence bits (b):", b);
    println!("{:<22} {:.3e}  (1/2^{})",        "FP per k-mer:",    fp_kmer,   b);
    println!("{:<22} {:.3e}  (1/2^{})",        "FP per z-window:", fp_window, b as u32 * z as u32);
 }
@@ -2,6 +2,7 @@ use std::path::PathBuf;
 use clap::Args;
 use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex};
 use obilayeredmap::EvidenceKind;
 fn parse_key_value(s: &str) -> Result<(String, String), String> {
    let pos = s.find('=').ok_or_else(|| format!("invalid key=value: no '=' in '{s}'"))?;
@@ -48,14 +49,116 @@ pub struct IndexArgs {
    #[arg(long, default_value_t = false)]
    pub keep_intermediate: bool,
    /// Use approximate (fingerprint-based) evidence instead of exact evidence.
    /// False-positive rate per z-window: 1/2^(b·z).
    #[arg(long, default_value_t = false)]
    pub approx: bool,
    /// Findere z parameter: number of consecutive k-mers that must all match.
    /// Effective indexed k-mer size is kmer_size - z + 1.
    #[arg(short = 'z', long, default_value = None)]
    pub findere_z: Option<u8>,
    /// Fingerprint bits per slot (b).  FP per z-window = 1/2^(b·z).
    #[arg(long, default_value = None)]
    pub evidence_bits: Option<u8>,
    /// Target false-positive rate per z-window (e.g. 0.01).
    /// Used to derive missing b or z.
    #[arg(long, default_value = None)]
    pub fp: Option<f64>,
    #[command(flatten)]
    pub common: CommonArgs,
 }
 /// Resolve the (z, b, fp) triplet from the user-supplied subset.
 ///
 /// Model: FP = 1/2^(b·z)  ⟹  b·z = ⌈-log₂(fp)⌉
 ///
 /// Rules when one value is missing (conservative = ceiling):
 ///   given z, b       → fp = 1/2^(b·z)
 ///   given z, fp      → b  = ⌈-log₂(fp) / z⌉
 ///   given b, fp      → z  = ⌈-log₂(fp) / b⌉
 ///   given z only     → b  = 8  (default),  fp derived
 ///   given b only     → z  = 1  (default),  fp derived
 ///   given fp only    → b  = 8  (default),  z  derived
 ///   none given       → z  = 1, b = 8, fp = 1/256
 pub(crate) fn resolve_approx_params(
    z_opt: Option<u8>,
    b_opt: Option<u8>,
    fp_opt: Option<f64>,
 ) -> (u8, u8, f64) {
    const DEFAULT_B: u8 = 8;
    const DEFAULT_Z: u8 = 1;
    let bits_needed = |fp: f64| -> u8 {
        (-fp.log2()).ceil() as u8
    };
    match (z_opt, b_opt, fp_opt) {
        // All three given: use b and z, recompute fp conservatively.
        (Some(z), Some(b), Some(_fp)) => {
            let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, fp)
        }
        // Two given, derive third.
        (Some(z), Some(b), None) => {
            let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, fp)
        }
        (Some(z), None, Some(fp)) => {
            let bz = (-fp.log2()).ceil() as u32;
            let b = ((bz + z as u32 - 1) / z as u32).max(1) as u8;
            let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, actual_fp)
        }
        (None, Some(b), Some(fp)) => {
            let bz = (-fp.log2()).ceil() as u32;
            let z = ((bz + b as u32 - 1) / b as u32).max(1) as u8;
            let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, actual_fp)
        }
        // One given, apply defaults for the other.
        (Some(z), None, None) => {
            let b = DEFAULT_B;
            let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, fp)
        }
        (None, Some(b), None) => {
            let z = DEFAULT_Z;
            let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, fp)
        }
        (None, None, Some(fp)) => {
            let b = DEFAULT_B;
            let z = ((bits_needed(fp) as u32 + b as u32 - 1) / b as u32).max(1) as u8;
            let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, actual_fp)
        }
        // None given: defaults.
        (None, None, None) => {
            let b = DEFAULT_B;
            let z = DEFAULT_Z;
            let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
            (z, b, fp)
        }
    }
 }
 pub fn run(args: IndexArgs) {
    let output = args.output.clone();
    let mut rep = Reporter::new();
    // ── Resolve evidence kind ────────────────────────────────────────────────
    let evidence = if args.approx {
        let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
        info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}");
        EvidenceKind::Approx { b, z }
    } else {
        EvidenceKind::Exact
    };
    // ── Open or create the index ─────────────────────────────────────────────
    let mut idx = if KmerIndex::exists(&output) && !args.force {
        info!("resuming from existing index at {}", output.display());
@@ -81,6 +184,7 @@ pub fn run(args: IndexArgs) {
            minimizer_size: args.common.minimizer_size,
            n_bits,
            with_counts:    args.with_counts,
            evidence:       evidence.clone(),
        };
        let genome_info = args.label.as_ref().map(|label| {
            let mut info = GenomeInfo::new(label.clone());
@@ -1,6 +1,7 @@
 pub mod annotate;
 pub mod distance;
 pub mod dump;
 pub mod estimate;
 pub mod index;
 pub mod merge;
 pub mod query;
@@ -32,6 +32,8 @@ enum Commands {
    Distance(cmd::distance::DistanceArgs),
    /// Dump unitigs from a built index to stdout (debug)
    Unitig(cmd::unitig::UnitigArgs),
    /// Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing
    Estimate(cmd::estimate::EstimateArgs),
 }
 fn main() {
@@ -62,6 +64,7 @@ fn main() {
        Commands::Annotate(args)  => cmd::annotate::run(args),
        Commands::Distance(args)  => cmd::distance::run(args),
        Commands::Unitig(args)    => cmd::unitig::run(args),
        Commands::Estimate(args)  => cmd::estimate::run(args),
    }
    #[cfg(feature = "profiling")]
@@ -5,7 +5,7 @@ use cacheline_ef::{CachelineEf, CachelineEfVec};
 use epserde::prelude::*;
 use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
 use obidebruinj::GraphDeBruijn;
-use obilayeredmap::{OLMError, layer::Layer};
+use obilayeredmap::{EvidenceKind, OLMError, layer::Layer};
 use obilayeredmap::meta::PartitionMeta;
 use obiskio::{SKError, SKFileMeta, SKFileReader};
 use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
@@ -44,6 +44,7 @@ impl KmerPartition {
        min_ab: u32,
        max_ab: Option<u32>,
        with_counts: bool,
        evidence: &EvidenceKind,
    ) -> Result<usize, SKError> {
        let part_dir = self.part_dir(i);
        let dedup_path = part_dir.join("dereplicated.skmer.zst");
@@ -119,6 +120,12 @@ impl KmerPartition {
            Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
        }
        // For approximate evidence: replace the exact evidence bundle with a
        // fingerprint.  For exact evidence, build() already wrote it.
        if let EvidenceKind::Approx { b, z } = evidence {
            Layer::<()>::build_approx_evidence(&layer_dir, *b, *z).map_err(olm_to_sk)?;
        }
        // Write meta.json in the index/ directory so LayeredMap::open works
        // (e.g. for subsequent merge operations).
        let index_dir = layer_dir.parent().expect("layer_dir has a parent");
@@ -10,6 +10,7 @@ use obikseq::CanonicalKmer;
 use obiskio::{UnitigFileReader, UnitigFileWriter};
 use crate::error::{OLMError, OLMResult};
 use crate::meta::EvidenceKind;
 use crate::mphf_layer::MphfLayer;
 pub(crate) use crate::mphf_layer::UNITIGS_FILE;
@@ -93,6 +94,12 @@ impl<D: LayerData> Layer<D> {
    pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
        MphfLayer::build_approx_evidence(layer_dir, b, z)
    }
    /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
    /// `kind`.
    pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
        MphfLayer::build_evidence(layer_dir, kind)
    }
 }
 // ── Mode 1 — set membership ───────────────────────────────────────────────────
@@ -17,9 +17,10 @@ pub enum EvidenceKind {
    /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`.  Zero false positives.
    Exact,
    /// Approximate evidence: `fingerprint.bin` only.
-    /// `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b.
+    /// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b.
    /// `z` — consecutive k-mers that must all match (Findere trick);
-    ///        effective FP rate per read ≈ (W / 2^(b*z)) where W = read windows.
+    ///        effective FP rate per sequencing read ≈ W / 2^(b·z)
    ///        where W = L - k - z + 2 is the number of windows in a read of length L.
    Approx { b: u8, z: u8 },
 }
@@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
 use crate::error::{OLMError, OLMResult};
 use crate::evidence::{Evidence, EvidenceWriter};
 use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
-use crate::meta::LayerMeta;
+use crate::meta::{EvidenceKind, LayerMeta};
 pub(crate) const MPHF_FILE:        &str = "mphf.bin";
 pub(crate) const UNITIGS_FILE:     &str = "unitigs.bin";
@@ -19,80 +19,117 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
 pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
 // ── Evidence store ────────────────────────────────────────────────────────────
 enum LayerEvidence {
    Exact { evidence: Evidence, unitigs: UnitigFileReader },
    Approx { fingerprint: FingerprintVec },
 }
 // ── MphfLayer ─────────────────────────────────────────────────────────────────
 /// Autonomous kmer → slot mapping for one layer.
 ///
-/// Answers presence/absence queries without any attached DataStore.
+/// Dispatches queries to exact or approximate evidence transparently based on
-/// Build once, never rebuilt — data stores are attached and derived externally.
+/// the `layer_meta.json` written at build time.
 pub struct MphfLayer {
    mphf: Mphf,
-    evidence: Evidence,
+    ev:   LayerEvidence,
    unitigs:  UnitigFileReader,
    n:    usize,
 }
 impl MphfLayer {
    pub fn open(dir: &Path) -> OLMResult<Self> {
        let meta = LayerMeta::load(dir)?;
        let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
            .map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
-        let unitigs  = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
+        let (ev, n) = match meta.evidence {
            EvidenceKind::Exact => {
                let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
                let n = evidence.len();
-        Ok(Self { mphf, evidence, unitigs, n })
+                let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
                (LayerEvidence::Exact { evidence, unitigs }, n)
            }
            EvidenceKind::Approx { .. } => {
                let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
                let n = fingerprint.n();
                (LayerEvidence::Approx { fingerprint }, n)
            }
        };
        Ok(Self { mphf, ev, n })
    }
-    /// Returns `Some(slot)` if `kmer` belongs to this layer, `None` otherwise.
+    // ── Query API ─────────────────────────────────────────────────────────────
    /// Transparent dispatch: routes to `find_exact` or `find_approx` based on
    /// the evidence loaded at `open` time.
    #[inline]
    pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
        match &self.ev {
            LayerEvidence::Exact { .. }  => self.find_exact(kmer),
            LayerEvidence::Approx { .. } => self.find_approx(kmer),
        }
    }
    /// Exact lookup: zero false positives.  Panics if the layer was opened with
    /// approximate evidence.
    #[inline]
    pub fn find_exact(&self, kmer: CanonicalKmer) -> Option<usize> {
        let LayerEvidence::Exact { evidence, unitigs } = &self.ev else {
            panic!("find_exact called on an approximate layer");
        };
        let slot = self.mphf.index(&kmer.raw());
        if slot >= self.n { return None; }
-        let (chunk_id, rank) = self.evidence.decode(slot);
+        let (chunk_id, rank) = evidence.decode(slot);
-        if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
+        if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
            Some(slot)
        } else {
            None
        }
    }
-    /// Returns `Some(slot)` if `kmer` passes the fingerprint check, `None` otherwise.
+    /// Approximate lookup: false-positive rate 1/2^b per k-mer query.  Panics
-    ///
+    /// if the layer was opened with exact evidence.
    /// False positive rate per query: 1/2^b (where b is the fingerprint width
    /// used at build time).  No `.idx` or `evidence.bin` needed at query time.
    #[inline]
-    pub fn find_approx(&self, kmer: CanonicalKmer, fp: &FingerprintVec) -> Option<usize> {
+    pub fn find_approx(&self, kmer: CanonicalKmer) -> Option<usize> {
        let LayerEvidence::Approx { fingerprint } = &self.ev else {
            panic!("find_approx called on an exact layer");
        };
        let slot = self.mphf.index(&kmer.raw());
        if slot >= self.n { return None; }
-        if fp.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
+        if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
    }
    pub fn n(&self) -> usize { self.n }
    // ── Build helpers ─────────────────────────────────────────────────────────
    pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
        fs::create_dir_all(dir)?;
        Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?)
    }
-    /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
+    /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
-    /// `mphf.bin` already present in `dir`.
+    /// `kind`.
    pub fn build_evidence(dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
        match kind {
            EvidenceKind::Exact              => Self::build_exact_evidence(dir),
            EvidenceKind::Approx { b, z }    => Self::build_approx_evidence(dir, *b, *z),
        }
    }
    /// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
    ///
-    /// This is the exact-evidence construction route.  It can be called:
+    /// Uses sequential iteration — no `.idx` required on entry.
    /// - after the initial build (via [`Self::build`] which calls it internally)
    /// - standalone to promote an existing (unitigs + mphf) into an exact index
    /// - standalone to rebuild evidence after a format change
    /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
    /// `mphf.bin` already present in `dir`.
    ///
    /// Uses sequential iteration — no `.idx` is required on entry.
    /// Writes both `evidence.bin` and `unitigs.bin.idx` on success.
    pub fn build_exact_evidence(dir: &Path) -> OLMResult<usize> {
        let unitig_path = dir.join(UNITIGS_FILE);
        // Sequential scan — no .idx required for iteration
        let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
        let n = unitigs.n_kmers();
        if n == 0 {
            fs::File::create(dir.join(EVIDENCE_FILE))?;
            build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
            LayerMeta::exact().save(dir)?;
            return Ok(0);
        }
@@ -117,18 +154,14 @@ impl MphfLayer {
        }
        ev.write(&dir.join(EVIDENCE_FILE))?;
        // Write .idx last: it is only needed for random access (queries).
        build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
        LayerMeta::exact().save(dir)?;
        Ok(n)
    }
-    /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
+    /// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`.
-    /// in `dir`.  `b` — fingerprint bits (1..=64); `z` — Findere consecutive
+    /// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer
-    /// k-mer parameter (≥1).
+    /// parameter (≥1).  No `.idx` is written.
    ///
    /// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
    /// No `.idx` file is written — approximate evidence needs no random access.
    pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
        if b == 0 || b > 64 {
            return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
@@ -175,8 +208,6 @@ impl MphfLayer {
        let unitig_path = dir.join(UNITIGS_FILE);
        // Write .idx so that UnitigFileReader::open succeeds and parallel
        // random access is available for MPHF construction.
        build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
        let unitigs = UnitigFileReader::open(&unitig_path)?;
@@ -189,10 +220,11 @@ impl MphfLayer {
                    .ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
            mphf.store(&dir.join(MPHF_FILE))
                .map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
            LayerMeta::exact().save(dir)?;
            return Ok(0);
        }
-        // Pass 1 — build MPHF (parallel, uses random access via .idx)
+        // Pass 1 — build MPHF (parallel, random access via .idx)
        let keys = (0..unitigs.len())
            .into_par_iter()
            .flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));