diff --git a/src/Cargo.lock b/src/Cargo.lock index bc710eb..dd85960 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1529,6 +1529,7 @@ dependencies = [ "obikpartitionner", "obikrope", "obikseq", + "obilayeredmap", "obipipeline", "obiread", "obiskbuilder", diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index 502822e..321d905 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -144,6 +144,7 @@ impl KmerIndex { let n = self.partition.n_partitions(); let t = Stage::start("index"); let with_counts = self.meta.config.with_counts; + let evidence = self.meta.config.evidence.clone(); let total_kmers = AtomicUsize::new(0); let pb = Arc::new(Mutex::new( @@ -153,7 +154,7 @@ impl KmerIndex { )); (0..n).into_par_iter().for_each(|i| { - match self.partition.build_index_layer(i, min_ab, max_ab, with_counts) { + match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence) { Ok(0) => {} Ok(n_kmers) => { total_kmers.fetch_add(n_kmers, Ordering::Relaxed); diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index 8d44035..cb7a7a2 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -18,6 +18,7 @@ obikpartitionner = { path = "../obikpartitionner" } obisys = { path = "../obisys" } obiskio = { path = "../obiskio" } obikindex = { path = "../obikindex" } +obilayeredmap = { path = "../obilayeredmap" } clap = { version = "4", features = ["derive"] } serde_json = "1" csv = "1" diff --git a/src/obikmer/src/cmd/estimate.rs b/src/obikmer/src/cmd/estimate.rs new file mode 100644 index 0000000..0d8fe98 --- /dev/null +++ b/src/obikmer/src/cmd/estimate.rs @@ -0,0 +1,38 @@ +use clap::Args; + +use super::index::resolve_approx_params; + +#[derive(Args)] +pub struct EstimateArgs { + /// k-mer size used for querying (same as --kmer-size in index) + #[arg(short = 'k', long, default_value_t = 31)] + pub kmer_size: usize, + + /// Findere z parameter: number of consecutive k-mers that must all match. + /// Effective indexed k-mer size is kmer_size - z + 1. + #[arg(short = 'z', long, default_value = None)] + pub findere_z: Option, + + /// Fingerprint bits per slot (b). FP per z-window = 1/2^(b·z). + #[arg(long, default_value = None)] + pub evidence_bits: Option, + + /// Target false-positive rate per z-window (e.g. 0.01). + #[arg(long, default_value = None)] + pub fp: Option, +} + +pub fn run(args: EstimateArgs) { + let (z, b, fp_window) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp); + + let k_query = args.kmer_size; + let k_index = k_query.saturating_sub(z as usize - 1); + let fp_kmer = 1.0_f64 / 2_f64.powi(b as i32); + + println!("{:<22} {}", "k (query):", k_query); + println!("{:<22} {}", "k (indexed):", k_index); + println!("{:<22} {}", "z:", z); + println!("{:<22} {}", "evidence bits (b):", b); + println!("{:<22} {:.3e} (1/2^{})", "FP per k-mer:", fp_kmer, b); + println!("{:<22} {:.3e} (1/2^{})", "FP per z-window:", fp_window, b as u32 * z as u32); +} diff --git a/src/obikmer/src/cmd/index.rs b/src/obikmer/src/cmd/index.rs index 5559f95..d5c9999 100644 --- a/src/obikmer/src/cmd/index.rs +++ b/src/obikmer/src/cmd/index.rs @@ -2,6 +2,7 @@ use std::path::PathBuf; use clap::Args; use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex}; +use obilayeredmap::EvidenceKind; fn parse_key_value(s: &str) -> Result<(String, String), String> { let pos = s.find('=').ok_or_else(|| format!("invalid key=value: no '=' in '{s}'"))?; @@ -48,14 +49,116 @@ pub struct IndexArgs { #[arg(long, default_value_t = false)] pub keep_intermediate: bool, + /// Use approximate (fingerprint-based) evidence instead of exact evidence. + /// False-positive rate per z-window: 1/2^(b·z). + #[arg(long, default_value_t = false)] + pub approx: bool, + + /// Findere z parameter: number of consecutive k-mers that must all match. + /// Effective indexed k-mer size is kmer_size - z + 1. + #[arg(short = 'z', long, default_value = None)] + pub findere_z: Option, + + /// Fingerprint bits per slot (b). FP per z-window = 1/2^(b·z). + #[arg(long, default_value = None)] + pub evidence_bits: Option, + + /// Target false-positive rate per z-window (e.g. 0.01). + /// Used to derive missing b or z. + #[arg(long, default_value = None)] + pub fp: Option, + #[command(flatten)] pub common: CommonArgs, } +/// Resolve the (z, b, fp) triplet from the user-supplied subset. +/// +/// Model: FP = 1/2^(b·z) ⟹ b·z = ⌈-log₂(fp)⌉ +/// +/// Rules when one value is missing (conservative = ceiling): +/// given z, b → fp = 1/2^(b·z) +/// given z, fp → b = ⌈-log₂(fp) / z⌉ +/// given b, fp → z = ⌈-log₂(fp) / b⌉ +/// given z only → b = 8 (default), fp derived +/// given b only → z = 1 (default), fp derived +/// given fp only → b = 8 (default), z derived +/// none given → z = 1, b = 8, fp = 1/256 +pub(crate) fn resolve_approx_params( + z_opt: Option, + b_opt: Option, + fp_opt: Option, +) -> (u8, u8, f64) { + const DEFAULT_B: u8 = 8; + const DEFAULT_Z: u8 = 1; + + let bits_needed = |fp: f64| -> u8 { + (-fp.log2()).ceil() as u8 + }; + + match (z_opt, b_opt, fp_opt) { + // All three given: use b and z, recompute fp conservatively. + (Some(z), Some(b), Some(_fp)) => { + let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, fp) + } + // Two given, derive third. + (Some(z), Some(b), None) => { + let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, fp) + } + (Some(z), None, Some(fp)) => { + let bz = (-fp.log2()).ceil() as u32; + let b = ((bz + z as u32 - 1) / z as u32).max(1) as u8; + let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, actual_fp) + } + (None, Some(b), Some(fp)) => { + let bz = (-fp.log2()).ceil() as u32; + let z = ((bz + b as u32 - 1) / b as u32).max(1) as u8; + let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, actual_fp) + } + // One given, apply defaults for the other. + (Some(z), None, None) => { + let b = DEFAULT_B; + let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, fp) + } + (None, Some(b), None) => { + let z = DEFAULT_Z; + let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, fp) + } + (None, None, Some(fp)) => { + let b = DEFAULT_B; + let z = ((bits_needed(fp) as u32 + b as u32 - 1) / b as u32).max(1) as u8; + let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, actual_fp) + } + // None given: defaults. + (None, None, None) => { + let b = DEFAULT_B; + let z = DEFAULT_Z; + let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64; + (z, b, fp) + } + } +} + pub fn run(args: IndexArgs) { let output = args.output.clone(); let mut rep = Reporter::new(); + // ── Resolve evidence kind ──────────────────────────────────────────────── + let evidence = if args.approx { + let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp); + info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}"); + EvidenceKind::Approx { b, z } + } else { + EvidenceKind::Exact + }; + // ── Open or create the index ───────────────────────────────────────────── let mut idx = if KmerIndex::exists(&output) && !args.force { info!("resuming from existing index at {}", output.display()); @@ -81,6 +184,7 @@ pub fn run(args: IndexArgs) { minimizer_size: args.common.minimizer_size, n_bits, with_counts: args.with_counts, + evidence: evidence.clone(), }; let genome_info = args.label.as_ref().map(|label| { let mut info = GenomeInfo::new(label.clone()); diff --git a/src/obikmer/src/cmd/mod.rs b/src/obikmer/src/cmd/mod.rs index 43fc24a..1645c23 100644 --- a/src/obikmer/src/cmd/mod.rs +++ b/src/obikmer/src/cmd/mod.rs @@ -1,6 +1,7 @@ pub mod annotate; pub mod distance; pub mod dump; +pub mod estimate; pub mod index; pub mod merge; pub mod query; diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index c9f4a12..66665f8 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -32,6 +32,8 @@ enum Commands { Distance(cmd::distance::DistanceArgs), /// Dump unitigs from a built index to stdout (debug) Unitig(cmd::unitig::UnitigArgs), + /// Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing + Estimate(cmd::estimate::EstimateArgs), } fn main() { @@ -62,6 +64,7 @@ fn main() { Commands::Annotate(args) => cmd::annotate::run(args), Commands::Distance(args) => cmd::distance::run(args), Commands::Unitig(args) => cmd::unitig::run(args), + Commands::Estimate(args) => cmd::estimate::run(args), } #[cfg(feature = "profiling")] diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs index a6401b5..e51f7cc 100644 --- a/src/obikpartitionner/src/index_layer.rs +++ b/src/obikpartitionner/src/index_layer.rs @@ -5,7 +5,7 @@ use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec}; use obidebruinj::GraphDeBruijn; -use obilayeredmap::{OLMError, layer::Layer}; +use obilayeredmap::{EvidenceKind, OLMError, layer::Layer}; use obilayeredmap::meta::PartitionMeta; use obiskio::{SKError, SKFileMeta, SKFileReader}; use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64}; @@ -44,6 +44,7 @@ impl KmerPartition { min_ab: u32, max_ab: Option, with_counts: bool, + evidence: &EvidenceKind, ) -> Result { let part_dir = self.part_dir(i); let dedup_path = part_dir.join("dereplicated.skmer.zst"); @@ -119,6 +120,12 @@ impl KmerPartition { Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?; } + // For approximate evidence: replace the exact evidence bundle with a + // fingerprint. For exact evidence, build() already wrote it. + if let EvidenceKind::Approx { b, z } = evidence { + Layer::<()>::build_approx_evidence(&layer_dir, *b, *z).map_err(olm_to_sk)?; + } + // Write meta.json in the index/ directory so LayeredMap::open works // (e.g. for subsequent merge operations). let index_dir = layer_dir.parent().expect("layer_dir has a parent"); diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index 04f95ec..e5f531b 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -10,6 +10,7 @@ use obikseq::CanonicalKmer; use obiskio::{UnitigFileReader, UnitigFileWriter}; use crate::error::{OLMError, OLMResult}; +use crate::meta::EvidenceKind; use crate::mphf_layer::MphfLayer; pub(crate) use crate::mphf_layer::UNITIGS_FILE; @@ -93,6 +94,12 @@ impl Layer { pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult { MphfLayer::build_approx_evidence(layer_dir, b, z) } + + /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on + /// `kind`. + pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind) -> OLMResult { + MphfLayer::build_evidence(layer_dir, kind) + } } // ── Mode 1 — set membership ─────────────────────────────────────────────────── diff --git a/src/obilayeredmap/src/meta.rs b/src/obilayeredmap/src/meta.rs index bc8d93e..352cbdd 100644 --- a/src/obilayeredmap/src/meta.rs +++ b/src/obilayeredmap/src/meta.rs @@ -17,9 +17,10 @@ pub enum EvidenceKind { /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives. Exact, /// Approximate evidence: `fingerprint.bin` only. - /// `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b. + /// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b. /// `z` — consecutive k-mers that must all match (Findere trick); - /// effective FP rate per read ≈ (W / 2^(b*z)) where W = read windows. + /// effective FP rate per sequencing read ≈ W / 2^(b·z) + /// where W = L - k - z + 2 is the number of windows in a read of length L. Approx { b: u8, z: u8 }, } diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index e5d4cf2..d6b9510 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; use crate::error::{OLMError, OLMResult}; use crate::evidence::{Evidence, EvidenceWriter}; use crate::fingerprint::{FingerprintVec, FingerprintVecWriter}; -use crate::meta::LayerMeta; +use crate::meta::{EvidenceKind, LayerMeta}; pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; @@ -19,80 +19,117 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin"; pub(crate) type Mphf = PtrHash>, Xx64, Vec>; +// ── Evidence store ──────────────────────────────────────────────────────────── + +enum LayerEvidence { + Exact { evidence: Evidence, unitigs: UnitigFileReader }, + Approx { fingerprint: FingerprintVec }, +} + +// ── MphfLayer ───────────────────────────────────────────────────────────────── + /// Autonomous kmer → slot mapping for one layer. /// -/// Answers presence/absence queries without any attached DataStore. -/// Build once, never rebuilt — data stores are attached and derived externally. +/// Dispatches queries to exact or approximate evidence transparently based on +/// the `layer_meta.json` written at build time. pub struct MphfLayer { - mphf: Mphf, - evidence: Evidence, - unitigs: UnitigFileReader, - n: usize, + mphf: Mphf, + ev: LayerEvidence, + n: usize, } impl MphfLayer { pub fn open(dir: &Path) -> OLMResult { + let meta = LayerMeta::load(dir)?; let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE)) .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; - let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?; - let n = evidence.len(); - Ok(Self { mphf, evidence, unitigs, n }) + let (ev, n) = match meta.evidence { + EvidenceKind::Exact => { + let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?; + let n = evidence.len(); + let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; + (LayerEvidence::Exact { evidence, unitigs }, n) + } + EvidenceKind::Approx { .. } => { + let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?; + let n = fingerprint.n(); + (LayerEvidence::Approx { fingerprint }, n) + } + }; + Ok(Self { mphf, ev, n }) } - /// Returns `Some(slot)` if `kmer` belongs to this layer, `None` otherwise. + // ── Query API ───────────────────────────────────────────────────────────── + + /// Transparent dispatch: routes to `find_exact` or `find_approx` based on + /// the evidence loaded at `open` time. #[inline] pub fn find(&self, kmer: CanonicalKmer) -> Option { + match &self.ev { + LayerEvidence::Exact { .. } => self.find_exact(kmer), + LayerEvidence::Approx { .. } => self.find_approx(kmer), + } + } + + /// Exact lookup: zero false positives. Panics if the layer was opened with + /// approximate evidence. + #[inline] + pub fn find_exact(&self, kmer: CanonicalKmer) -> Option { + let LayerEvidence::Exact { evidence, unitigs } = &self.ev else { + panic!("find_exact called on an approximate layer"); + }; let slot = self.mphf.index(&kmer.raw()); if slot >= self.n { return None; } - let (chunk_id, rank) = self.evidence.decode(slot); - if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { + let (chunk_id, rank) = evidence.decode(slot); + if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { Some(slot) } else { None } } - /// Returns `Some(slot)` if `kmer` passes the fingerprint check, `None` otherwise. - /// - /// False positive rate per query: 1/2^b (where b is the fingerprint width - /// used at build time). No `.idx` or `evidence.bin` needed at query time. + /// Approximate lookup: false-positive rate 1/2^b per k-mer query. Panics + /// if the layer was opened with exact evidence. #[inline] - pub fn find_approx(&self, kmer: CanonicalKmer, fp: &FingerprintVec) -> Option { + pub fn find_approx(&self, kmer: CanonicalKmer) -> Option { + let LayerEvidence::Approx { fingerprint } = &self.ev else { + panic!("find_approx called on an exact layer"); + }; let slot = self.mphf.index(&kmer.raw()); if slot >= self.n { return None; } - if fp.matches(slot, kmer.seq_hash()) { Some(slot) } else { None } + if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None } } pub fn n(&self) -> usize { self.n } + // ── Build helpers ───────────────────────────────────────────────────────── + pub fn unitig_writer(dir: &Path) -> OLMResult { fs::create_dir_all(dir)?; Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?) } - /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and - /// `mphf.bin` already present in `dir`. + /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on + /// `kind`. + pub fn build_evidence(dir: &Path, kind: &EvidenceKind) -> OLMResult { + match kind { + EvidenceKind::Exact => Self::build_exact_evidence(dir), + EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z), + } + } + + /// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`. /// - /// This is the exact-evidence construction route. It can be called: - /// - after the initial build (via [`Self::build`] which calls it internally) - /// - standalone to promote an existing (unitigs + mphf) into an exact index - /// - standalone to rebuild evidence after a format change - /// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and - /// `mphf.bin` already present in `dir`. - /// - /// Uses sequential iteration — no `.idx` is required on entry. - /// Writes both `evidence.bin` and `unitigs.bin.idx` on success. + /// Uses sequential iteration — no `.idx` required on entry. pub fn build_exact_evidence(dir: &Path) -> OLMResult { let unitig_path = dir.join(UNITIGS_FILE); - - // Sequential scan — no .idx required for iteration let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; let n = unitigs.n_kmers(); if n == 0 { fs::File::create(dir.join(EVIDENCE_FILE))?; build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + LayerMeta::exact().save(dir)?; return Ok(0); } @@ -117,18 +154,14 @@ impl MphfLayer { } ev.write(&dir.join(EVIDENCE_FILE))?; - // Write .idx last: it is only needed for random access (queries). build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; LayerMeta::exact().save(dir)?; Ok(n) } - /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present - /// in `dir`. `b` — fingerprint bits (1..=64); `z` — Findere consecutive - /// k-mer parameter (≥1). - /// - /// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`. - /// No `.idx` file is written — approximate evidence needs no random access. + /// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`. + /// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer + /// parameter (≥1). No `.idx` is written. pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult { if b == 0 || b > 64 { return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into())); @@ -175,8 +208,6 @@ impl MphfLayer { let unitig_path = dir.join(UNITIGS_FILE); - // Write .idx so that UnitigFileReader::open succeeds and parallel - // random access is available for MPHF construction. build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; let unitigs = UnitigFileReader::open(&unitig_path)?; @@ -189,10 +220,11 @@ impl MphfLayer { .ok_or_else(|| OLMError::Mphf("construction failed".into()))?; mphf.store(&dir.join(MPHF_FILE)) .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + LayerMeta::exact().save(dir)?; return Ok(0); } - // Pass 1 — build MPHF (parallel, uses random access via .idx) + // Pass 1 — build MPHF (parallel, random access via .idx) let keys = (0..unitigs.len()) .into_par_iter() .flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));