feat: add approximate evidence matching and index estimation CLI
Introduces a new `estimate` CLI subcommand to calculate bloom filter size, evidence bits, and false-positive rates for approximate indexing. Updates the index building and querying pipelines to support both exact and approximate evidence types via a unified `EvidenceKind` abstraction. Refactors `MphfLayer` and partition index builders to route operations based on the selected evidence mode, and adds the required `obilayeredmap` dependency.
This commit is contained in:
Generated
+1
@@ -1529,6 +1529,7 @@ dependencies = [
|
|||||||
"obikpartitionner",
|
"obikpartitionner",
|
||||||
"obikrope",
|
"obikrope",
|
||||||
"obikseq",
|
"obikseq",
|
||||||
|
"obilayeredmap",
|
||||||
"obipipeline",
|
"obipipeline",
|
||||||
"obiread",
|
"obiread",
|
||||||
"obiskbuilder",
|
"obiskbuilder",
|
||||||
|
|||||||
@@ -144,6 +144,7 @@ impl KmerIndex {
|
|||||||
let n = self.partition.n_partitions();
|
let n = self.partition.n_partitions();
|
||||||
let t = Stage::start("index");
|
let t = Stage::start("index");
|
||||||
let with_counts = self.meta.config.with_counts;
|
let with_counts = self.meta.config.with_counts;
|
||||||
|
let evidence = self.meta.config.evidence.clone();
|
||||||
let total_kmers = AtomicUsize::new(0);
|
let total_kmers = AtomicUsize::new(0);
|
||||||
|
|
||||||
let pb = Arc::new(Mutex::new(
|
let pb = Arc::new(Mutex::new(
|
||||||
@@ -153,7 +154,7 @@ impl KmerIndex {
|
|||||||
));
|
));
|
||||||
|
|
||||||
(0..n).into_par_iter().for_each(|i| {
|
(0..n).into_par_iter().for_each(|i| {
|
||||||
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts) {
|
match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence) {
|
||||||
Ok(0) => {}
|
Ok(0) => {}
|
||||||
Ok(n_kmers) => {
|
Ok(n_kmers) => {
|
||||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ obikpartitionner = { path = "../obikpartitionner" }
|
|||||||
obisys = { path = "../obisys" }
|
obisys = { path = "../obisys" }
|
||||||
obiskio = { path = "../obiskio" }
|
obiskio = { path = "../obiskio" }
|
||||||
obikindex = { path = "../obikindex" }
|
obikindex = { path = "../obikindex" }
|
||||||
|
obilayeredmap = { path = "../obilayeredmap" }
|
||||||
clap = { version = "4", features = ["derive"] }
|
clap = { version = "4", features = ["derive"] }
|
||||||
serde_json = "1"
|
serde_json = "1"
|
||||||
csv = "1"
|
csv = "1"
|
||||||
|
|||||||
@@ -0,0 +1,38 @@
|
|||||||
|
use clap::Args;
|
||||||
|
|
||||||
|
use super::index::resolve_approx_params;
|
||||||
|
|
||||||
|
#[derive(Args)]
|
||||||
|
pub struct EstimateArgs {
|
||||||
|
/// k-mer size used for querying (same as --kmer-size in index)
|
||||||
|
#[arg(short = 'k', long, default_value_t = 31)]
|
||||||
|
pub kmer_size: usize,
|
||||||
|
|
||||||
|
/// Findere z parameter: number of consecutive k-mers that must all match.
|
||||||
|
/// Effective indexed k-mer size is kmer_size - z + 1.
|
||||||
|
#[arg(short = 'z', long, default_value = None)]
|
||||||
|
pub findere_z: Option<u8>,
|
||||||
|
|
||||||
|
/// Fingerprint bits per slot (b). FP per z-window = 1/2^(b·z).
|
||||||
|
#[arg(long, default_value = None)]
|
||||||
|
pub evidence_bits: Option<u8>,
|
||||||
|
|
||||||
|
/// Target false-positive rate per z-window (e.g. 0.01).
|
||||||
|
#[arg(long, default_value = None)]
|
||||||
|
pub fp: Option<f64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(args: EstimateArgs) {
|
||||||
|
let (z, b, fp_window) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
|
||||||
|
|
||||||
|
let k_query = args.kmer_size;
|
||||||
|
let k_index = k_query.saturating_sub(z as usize - 1);
|
||||||
|
let fp_kmer = 1.0_f64 / 2_f64.powi(b as i32);
|
||||||
|
|
||||||
|
println!("{:<22} {}", "k (query):", k_query);
|
||||||
|
println!("{:<22} {}", "k (indexed):", k_index);
|
||||||
|
println!("{:<22} {}", "z:", z);
|
||||||
|
println!("{:<22} {}", "evidence bits (b):", b);
|
||||||
|
println!("{:<22} {:.3e} (1/2^{})", "FP per k-mer:", fp_kmer, b);
|
||||||
|
println!("{:<22} {:.3e} (1/2^{})", "FP per z-window:", fp_window, b as u32 * z as u32);
|
||||||
|
}
|
||||||
@@ -2,6 +2,7 @@ use std::path::PathBuf;
|
|||||||
|
|
||||||
use clap::Args;
|
use clap::Args;
|
||||||
use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex};
|
use obikindex::{GenomeInfo, IndexConfig, IndexState, KmerIndex};
|
||||||
|
use obilayeredmap::EvidenceKind;
|
||||||
|
|
||||||
fn parse_key_value(s: &str) -> Result<(String, String), String> {
|
fn parse_key_value(s: &str) -> Result<(String, String), String> {
|
||||||
let pos = s.find('=').ok_or_else(|| format!("invalid key=value: no '=' in '{s}'"))?;
|
let pos = s.find('=').ok_or_else(|| format!("invalid key=value: no '=' in '{s}'"))?;
|
||||||
@@ -48,14 +49,116 @@ pub struct IndexArgs {
|
|||||||
#[arg(long, default_value_t = false)]
|
#[arg(long, default_value_t = false)]
|
||||||
pub keep_intermediate: bool,
|
pub keep_intermediate: bool,
|
||||||
|
|
||||||
|
/// Use approximate (fingerprint-based) evidence instead of exact evidence.
|
||||||
|
/// False-positive rate per z-window: 1/2^(b·z).
|
||||||
|
#[arg(long, default_value_t = false)]
|
||||||
|
pub approx: bool,
|
||||||
|
|
||||||
|
/// Findere z parameter: number of consecutive k-mers that must all match.
|
||||||
|
/// Effective indexed k-mer size is kmer_size - z + 1.
|
||||||
|
#[arg(short = 'z', long, default_value = None)]
|
||||||
|
pub findere_z: Option<u8>,
|
||||||
|
|
||||||
|
/// Fingerprint bits per slot (b). FP per z-window = 1/2^(b·z).
|
||||||
|
#[arg(long, default_value = None)]
|
||||||
|
pub evidence_bits: Option<u8>,
|
||||||
|
|
||||||
|
/// Target false-positive rate per z-window (e.g. 0.01).
|
||||||
|
/// Used to derive missing b or z.
|
||||||
|
#[arg(long, default_value = None)]
|
||||||
|
pub fp: Option<f64>,
|
||||||
|
|
||||||
#[command(flatten)]
|
#[command(flatten)]
|
||||||
pub common: CommonArgs,
|
pub common: CommonArgs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Resolve the (z, b, fp) triplet from the user-supplied subset.
|
||||||
|
///
|
||||||
|
/// Model: FP = 1/2^(b·z) ⟹ b·z = ⌈-log₂(fp)⌉
|
||||||
|
///
|
||||||
|
/// Rules when one value is missing (conservative = ceiling):
|
||||||
|
/// given z, b → fp = 1/2^(b·z)
|
||||||
|
/// given z, fp → b = ⌈-log₂(fp) / z⌉
|
||||||
|
/// given b, fp → z = ⌈-log₂(fp) / b⌉
|
||||||
|
/// given z only → b = 8 (default), fp derived
|
||||||
|
/// given b only → z = 1 (default), fp derived
|
||||||
|
/// given fp only → b = 8 (default), z derived
|
||||||
|
/// none given → z = 1, b = 8, fp = 1/256
|
||||||
|
pub(crate) fn resolve_approx_params(
|
||||||
|
z_opt: Option<u8>,
|
||||||
|
b_opt: Option<u8>,
|
||||||
|
fp_opt: Option<f64>,
|
||||||
|
) -> (u8, u8, f64) {
|
||||||
|
const DEFAULT_B: u8 = 8;
|
||||||
|
const DEFAULT_Z: u8 = 1;
|
||||||
|
|
||||||
|
let bits_needed = |fp: f64| -> u8 {
|
||||||
|
(-fp.log2()).ceil() as u8
|
||||||
|
};
|
||||||
|
|
||||||
|
match (z_opt, b_opt, fp_opt) {
|
||||||
|
// All three given: use b and z, recompute fp conservatively.
|
||||||
|
(Some(z), Some(b), Some(_fp)) => {
|
||||||
|
let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, fp)
|
||||||
|
}
|
||||||
|
// Two given, derive third.
|
||||||
|
(Some(z), Some(b), None) => {
|
||||||
|
let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, fp)
|
||||||
|
}
|
||||||
|
(Some(z), None, Some(fp)) => {
|
||||||
|
let bz = (-fp.log2()).ceil() as u32;
|
||||||
|
let b = ((bz + z as u32 - 1) / z as u32).max(1) as u8;
|
||||||
|
let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, actual_fp)
|
||||||
|
}
|
||||||
|
(None, Some(b), Some(fp)) => {
|
||||||
|
let bz = (-fp.log2()).ceil() as u32;
|
||||||
|
let z = ((bz + b as u32 - 1) / b as u32).max(1) as u8;
|
||||||
|
let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, actual_fp)
|
||||||
|
}
|
||||||
|
// One given, apply defaults for the other.
|
||||||
|
(Some(z), None, None) => {
|
||||||
|
let b = DEFAULT_B;
|
||||||
|
let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, fp)
|
||||||
|
}
|
||||||
|
(None, Some(b), None) => {
|
||||||
|
let z = DEFAULT_Z;
|
||||||
|
let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, fp)
|
||||||
|
}
|
||||||
|
(None, None, Some(fp)) => {
|
||||||
|
let b = DEFAULT_B;
|
||||||
|
let z = ((bits_needed(fp) as u32 + b as u32 - 1) / b as u32).max(1) as u8;
|
||||||
|
let actual_fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, actual_fp)
|
||||||
|
}
|
||||||
|
// None given: defaults.
|
||||||
|
(None, None, None) => {
|
||||||
|
let b = DEFAULT_B;
|
||||||
|
let z = DEFAULT_Z;
|
||||||
|
let fp = 1.0_f64 / (1u64 << (b as u32 * z as u32)) as f64;
|
||||||
|
(z, b, fp)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn run(args: IndexArgs) {
|
pub fn run(args: IndexArgs) {
|
||||||
let output = args.output.clone();
|
let output = args.output.clone();
|
||||||
let mut rep = Reporter::new();
|
let mut rep = Reporter::new();
|
||||||
|
|
||||||
|
// ── Resolve evidence kind ────────────────────────────────────────────────
|
||||||
|
let evidence = if args.approx {
|
||||||
|
let (z, b, fp) = resolve_approx_params(args.findere_z, args.evidence_bits, args.fp);
|
||||||
|
info!("approximate evidence: b={b}, z={z}, fp={fp:.2e}");
|
||||||
|
EvidenceKind::Approx { b, z }
|
||||||
|
} else {
|
||||||
|
EvidenceKind::Exact
|
||||||
|
};
|
||||||
|
|
||||||
// ── Open or create the index ─────────────────────────────────────────────
|
// ── Open or create the index ─────────────────────────────────────────────
|
||||||
let mut idx = if KmerIndex::exists(&output) && !args.force {
|
let mut idx = if KmerIndex::exists(&output) && !args.force {
|
||||||
info!("resuming from existing index at {}", output.display());
|
info!("resuming from existing index at {}", output.display());
|
||||||
@@ -81,6 +184,7 @@ pub fn run(args: IndexArgs) {
|
|||||||
minimizer_size: args.common.minimizer_size,
|
minimizer_size: args.common.minimizer_size,
|
||||||
n_bits,
|
n_bits,
|
||||||
with_counts: args.with_counts,
|
with_counts: args.with_counts,
|
||||||
|
evidence: evidence.clone(),
|
||||||
};
|
};
|
||||||
let genome_info = args.label.as_ref().map(|label| {
|
let genome_info = args.label.as_ref().map(|label| {
|
||||||
let mut info = GenomeInfo::new(label.clone());
|
let mut info = GenomeInfo::new(label.clone());
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
pub mod annotate;
|
pub mod annotate;
|
||||||
pub mod distance;
|
pub mod distance;
|
||||||
pub mod dump;
|
pub mod dump;
|
||||||
|
pub mod estimate;
|
||||||
pub mod index;
|
pub mod index;
|
||||||
pub mod merge;
|
pub mod merge;
|
||||||
pub mod query;
|
pub mod query;
|
||||||
|
|||||||
@@ -32,6 +32,8 @@ enum Commands {
|
|||||||
Distance(cmd::distance::DistanceArgs),
|
Distance(cmd::distance::DistanceArgs),
|
||||||
/// Dump unitigs from a built index to stdout (debug)
|
/// Dump unitigs from a built index to stdout (debug)
|
||||||
Unitig(cmd::unitig::UnitigArgs),
|
Unitig(cmd::unitig::UnitigArgs),
|
||||||
|
/// Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing
|
||||||
|
Estimate(cmd::estimate::EstimateArgs),
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
@@ -62,6 +64,7 @@ fn main() {
|
|||||||
Commands::Annotate(args) => cmd::annotate::run(args),
|
Commands::Annotate(args) => cmd::annotate::run(args),
|
||||||
Commands::Distance(args) => cmd::distance::run(args),
|
Commands::Distance(args) => cmd::distance::run(args),
|
||||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||||
|
Commands::Estimate(args) => cmd::estimate::run(args),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(feature = "profiling")]
|
#[cfg(feature = "profiling")]
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use cacheline_ef::{CachelineEf, CachelineEfVec};
|
|||||||
use epserde::prelude::*;
|
use epserde::prelude::*;
|
||||||
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
|
||||||
use obidebruinj::GraphDeBruijn;
|
use obidebruinj::GraphDeBruijn;
|
||||||
use obilayeredmap::{OLMError, layer::Layer};
|
use obilayeredmap::{EvidenceKind, OLMError, layer::Layer};
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obiskio::{SKError, SKFileMeta, SKFileReader};
|
use obiskio::{SKError, SKFileMeta, SKFileReader};
|
||||||
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
|
||||||
@@ -44,6 +44,7 @@ impl KmerPartition {
|
|||||||
min_ab: u32,
|
min_ab: u32,
|
||||||
max_ab: Option<u32>,
|
max_ab: Option<u32>,
|
||||||
with_counts: bool,
|
with_counts: bool,
|
||||||
|
evidence: &EvidenceKind,
|
||||||
) -> Result<usize, SKError> {
|
) -> Result<usize, SKError> {
|
||||||
let part_dir = self.part_dir(i);
|
let part_dir = self.part_dir(i);
|
||||||
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
let dedup_path = part_dir.join("dereplicated.skmer.zst");
|
||||||
@@ -119,6 +120,12 @@ impl KmerPartition {
|
|||||||
Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
|
Layer::<()>::build(&layer_dir).map_err(olm_to_sk)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// For approximate evidence: replace the exact evidence bundle with a
|
||||||
|
// fingerprint. For exact evidence, build() already wrote it.
|
||||||
|
if let EvidenceKind::Approx { b, z } = evidence {
|
||||||
|
Layer::<()>::build_approx_evidence(&layer_dir, *b, *z).map_err(olm_to_sk)?;
|
||||||
|
}
|
||||||
|
|
||||||
// Write meta.json in the index/ directory so LayeredMap::open works
|
// Write meta.json in the index/ directory so LayeredMap::open works
|
||||||
// (e.g. for subsequent merge operations).
|
// (e.g. for subsequent merge operations).
|
||||||
let index_dir = layer_dir.parent().expect("layer_dir has a parent");
|
let index_dir = layer_dir.parent().expect("layer_dir has a parent");
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use obikseq::CanonicalKmer;
|
|||||||
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||||
|
|
||||||
use crate::error::{OLMError, OLMResult};
|
use crate::error::{OLMError, OLMResult};
|
||||||
|
use crate::meta::EvidenceKind;
|
||||||
use crate::mphf_layer::MphfLayer;
|
use crate::mphf_layer::MphfLayer;
|
||||||
pub(crate) use crate::mphf_layer::UNITIGS_FILE;
|
pub(crate) use crate::mphf_layer::UNITIGS_FILE;
|
||||||
|
|
||||||
@@ -93,6 +94,12 @@ impl<D: LayerData> Layer<D> {
|
|||||||
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
||||||
MphfLayer::build_approx_evidence(layer_dir, b, z)
|
MphfLayer::build_approx_evidence(layer_dir, b, z)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
||||||
|
/// `kind`.
|
||||||
|
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
|
||||||
|
MphfLayer::build_evidence(layer_dir, kind)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
||||||
|
|||||||
@@ -17,9 +17,10 @@ pub enum EvidenceKind {
|
|||||||
/// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
|
/// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
|
||||||
Exact,
|
Exact,
|
||||||
/// Approximate evidence: `fingerprint.bin` only.
|
/// Approximate evidence: `fingerprint.bin` only.
|
||||||
/// `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b.
|
/// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b.
|
||||||
/// `z` — consecutive k-mers that must all match (Findere trick);
|
/// `z` — consecutive k-mers that must all match (Findere trick);
|
||||||
/// effective FP rate per read ≈ (W / 2^(b*z)) where W = read windows.
|
/// effective FP rate per sequencing read ≈ W / 2^(b·z)
|
||||||
|
/// where W = L - k - z + 2 is the number of windows in a read of length L.
|
||||||
Approx { b: u8, z: u8 },
|
Approx { b: u8, z: u8 },
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
|||||||
use crate::error::{OLMError, OLMResult};
|
use crate::error::{OLMError, OLMResult};
|
||||||
use crate::evidence::{Evidence, EvidenceWriter};
|
use crate::evidence::{Evidence, EvidenceWriter};
|
||||||
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
|
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
|
||||||
use crate::meta::LayerMeta;
|
use crate::meta::{EvidenceKind, LayerMeta};
|
||||||
|
|
||||||
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
||||||
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
||||||
@@ -19,80 +19,117 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
|
|||||||
|
|
||||||
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||||
|
|
||||||
|
// ── Evidence store ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
enum LayerEvidence {
|
||||||
|
Exact { evidence: Evidence, unitigs: UnitigFileReader },
|
||||||
|
Approx { fingerprint: FingerprintVec },
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── MphfLayer ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/// Autonomous kmer → slot mapping for one layer.
|
/// Autonomous kmer → slot mapping for one layer.
|
||||||
///
|
///
|
||||||
/// Answers presence/absence queries without any attached DataStore.
|
/// Dispatches queries to exact or approximate evidence transparently based on
|
||||||
/// Build once, never rebuilt — data stores are attached and derived externally.
|
/// the `layer_meta.json` written at build time.
|
||||||
pub struct MphfLayer {
|
pub struct MphfLayer {
|
||||||
mphf: Mphf,
|
mphf: Mphf,
|
||||||
evidence: Evidence,
|
ev: LayerEvidence,
|
||||||
unitigs: UnitigFileReader,
|
|
||||||
n: usize,
|
n: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MphfLayer {
|
impl MphfLayer {
|
||||||
pub fn open(dir: &Path) -> OLMResult<Self> {
|
pub fn open(dir: &Path) -> OLMResult<Self> {
|
||||||
|
let meta = LayerMeta::load(dir)?;
|
||||||
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
||||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
|
let (ev, n) = match meta.evidence {
|
||||||
|
EvidenceKind::Exact => {
|
||||||
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
|
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
|
||||||
let n = evidence.len();
|
let n = evidence.len();
|
||||||
Ok(Self { mphf, evidence, unitigs, n })
|
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
|
||||||
|
(LayerEvidence::Exact { evidence, unitigs }, n)
|
||||||
|
}
|
||||||
|
EvidenceKind::Approx { .. } => {
|
||||||
|
let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
|
||||||
|
let n = fingerprint.n();
|
||||||
|
(LayerEvidence::Approx { fingerprint }, n)
|
||||||
|
}
|
||||||
|
};
|
||||||
|
Ok(Self { mphf, ev, n })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `Some(slot)` if `kmer` belongs to this layer, `None` otherwise.
|
// ── Query API ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Transparent dispatch: routes to `find_exact` or `find_approx` based on
|
||||||
|
/// the evidence loaded at `open` time.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
|
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||||
|
match &self.ev {
|
||||||
|
LayerEvidence::Exact { .. } => self.find_exact(kmer),
|
||||||
|
LayerEvidence::Approx { .. } => self.find_approx(kmer),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Exact lookup: zero false positives. Panics if the layer was opened with
|
||||||
|
/// approximate evidence.
|
||||||
|
#[inline]
|
||||||
|
pub fn find_exact(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||||
|
let LayerEvidence::Exact { evidence, unitigs } = &self.ev else {
|
||||||
|
panic!("find_exact called on an approximate layer");
|
||||||
|
};
|
||||||
let slot = self.mphf.index(&kmer.raw());
|
let slot = self.mphf.index(&kmer.raw());
|
||||||
if slot >= self.n { return None; }
|
if slot >= self.n { return None; }
|
||||||
let (chunk_id, rank) = self.evidence.decode(slot);
|
let (chunk_id, rank) = evidence.decode(slot);
|
||||||
if self.unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
||||||
Some(slot)
|
Some(slot)
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `Some(slot)` if `kmer` passes the fingerprint check, `None` otherwise.
|
/// Approximate lookup: false-positive rate 1/2^b per k-mer query. Panics
|
||||||
///
|
/// if the layer was opened with exact evidence.
|
||||||
/// False positive rate per query: 1/2^b (where b is the fingerprint width
|
|
||||||
/// used at build time). No `.idx` or `evidence.bin` needed at query time.
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn find_approx(&self, kmer: CanonicalKmer, fp: &FingerprintVec) -> Option<usize> {
|
pub fn find_approx(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||||
|
let LayerEvidence::Approx { fingerprint } = &self.ev else {
|
||||||
|
panic!("find_approx called on an exact layer");
|
||||||
|
};
|
||||||
let slot = self.mphf.index(&kmer.raw());
|
let slot = self.mphf.index(&kmer.raw());
|
||||||
if slot >= self.n { return None; }
|
if slot >= self.n { return None; }
|
||||||
if fp.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
|
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn n(&self) -> usize { self.n }
|
pub fn n(&self) -> usize { self.n }
|
||||||
|
|
||||||
|
// ── Build helpers ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
||||||
fs::create_dir_all(dir)?;
|
fs::create_dir_all(dir)?;
|
||||||
Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?)
|
Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
|
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
||||||
/// `mphf.bin` already present in `dir`.
|
/// `kind`.
|
||||||
|
pub fn build_evidence(dir: &Path, kind: &EvidenceKind) -> OLMResult<usize> {
|
||||||
|
match kind {
|
||||||
|
EvidenceKind::Exact => Self::build_exact_evidence(dir),
|
||||||
|
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
|
||||||
///
|
///
|
||||||
/// This is the exact-evidence construction route. It can be called:
|
/// Uses sequential iteration — no `.idx` required on entry.
|
||||||
/// - after the initial build (via [`Self::build`] which calls it internally)
|
|
||||||
/// - standalone to promote an existing (unitigs + mphf) into an exact index
|
|
||||||
/// - standalone to rebuild evidence after a format change
|
|
||||||
/// Build `unitigs.bin.idx` and `evidence.bin` from `unitigs.bin` and
|
|
||||||
/// `mphf.bin` already present in `dir`.
|
|
||||||
///
|
|
||||||
/// Uses sequential iteration — no `.idx` is required on entry.
|
|
||||||
/// Writes both `evidence.bin` and `unitigs.bin.idx` on success.
|
|
||||||
pub fn build_exact_evidence(dir: &Path) -> OLMResult<usize> {
|
pub fn build_exact_evidence(dir: &Path) -> OLMResult<usize> {
|
||||||
let unitig_path = dir.join(UNITIGS_FILE);
|
let unitig_path = dir.join(UNITIGS_FILE);
|
||||||
|
|
||||||
// Sequential scan — no .idx required for iteration
|
|
||||||
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||||
let n = unitigs.n_kmers();
|
let n = unitigs.n_kmers();
|
||||||
|
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
||||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
||||||
|
LayerMeta::exact().save(dir)?;
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -117,18 +154,14 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||||
// Write .idx last: it is only needed for random access (queries).
|
|
||||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
||||||
LayerMeta::exact().save(dir)?;
|
LayerMeta::exact().save(dir)?;
|
||||||
Ok(n)
|
Ok(n)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
|
/// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`.
|
||||||
/// in `dir`. `b` — fingerprint bits (1..=64); `z` — Findere consecutive
|
/// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer
|
||||||
/// k-mer parameter (≥1).
|
/// parameter (≥1). No `.idx` is written.
|
||||||
///
|
|
||||||
/// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
|
|
||||||
/// No `.idx` file is written — approximate evidence needs no random access.
|
|
||||||
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
||||||
if b == 0 || b > 64 {
|
if b == 0 || b > 64 {
|
||||||
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
|
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
|
||||||
@@ -175,8 +208,6 @@ impl MphfLayer {
|
|||||||
|
|
||||||
let unitig_path = dir.join(UNITIGS_FILE);
|
let unitig_path = dir.join(UNITIGS_FILE);
|
||||||
|
|
||||||
// Write .idx so that UnitigFileReader::open succeeds and parallel
|
|
||||||
// random access is available for MPHF construction.
|
|
||||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
||||||
|
|
||||||
let unitigs = UnitigFileReader::open(&unitig_path)?;
|
let unitigs = UnitigFileReader::open(&unitig_path)?;
|
||||||
@@ -189,10 +220,11 @@ impl MphfLayer {
|
|||||||
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
||||||
mphf.store(&dir.join(MPHF_FILE))
|
mphf.store(&dir.join(MPHF_FILE))
|
||||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||||
|
LayerMeta::exact().save(dir)?;
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pass 1 — build MPHF (parallel, uses random access via .idx)
|
// Pass 1 — build MPHF (parallel, random access via .idx)
|
||||||
let keys = (0..unitigs.len())
|
let keys = (0..unitigs.len())
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
|
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
|
||||||
|
|||||||
Reference in New Issue
Block a user