diff --git a/src/obikpartitionner/src/index_layer.rs b/src/obikpartitionner/src/index_layer.rs index 39069aa..022ba95 100644 --- a/src/obikpartitionner/src/index_layer.rs +++ b/src/obikpartitionner/src/index_layer.rs @@ -5,7 +5,7 @@ use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec}; use obidebruinj::GraphDeBruijn; -use obilayeredmap::{EvidenceKind, OLMError, layer::Layer}; +use obilayeredmap::{IndexMode, OLMError, layer::Layer}; use obilayeredmap::meta::PartitionMeta; use obiskio::{SKError, SKFileMeta, SKFileReader}; use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64}; @@ -44,7 +44,7 @@ impl KmerPartition { min_ab: u32, max_ab: Option, with_counts: bool, - evidence: &EvidenceKind, + mode: &IndexMode, block_bits: u8, ) -> Result { let part_dir = self.part_dir(i); @@ -110,7 +110,7 @@ impl KmerPartition { uw.close()?; if with_counts { - Layer::::build(&layer_dir, block_bits, evidence, |kmer| { + Layer::::build(&layer_dir, block_bits, mode, |kmer| { match (&mphf1_opt, &counts1_opt) { (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())), _ => 1, @@ -118,13 +118,11 @@ impl KmerPartition { }) .map_err(olm_to_sk)?; } else { - Layer::<()>::build(&layer_dir, block_bits, evidence).map_err(olm_to_sk)?; + Layer::<()>::build(&layer_dir, block_bits, mode).map_err(olm_to_sk)?; } - // Write meta.json in the index/ directory so LayeredMap::open works - // (e.g. for subsequent merge operations). let index_dir = layer_dir.parent().expect("layer_dir has a parent"); - PartitionMeta { n_layers: 1 }.save(index_dir).map_err(olm_to_sk)?; + PartitionMeta { n_layers: 1, mode: mode.clone() }.save(index_dir).map_err(olm_to_sk)?; Ok(n_kmers) } diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs index fa6b1be..6107f0b 100644 --- a/src/obikpartitionner/src/merge_layer.rs +++ b/src/obikpartitionner/src/merge_layer.rs @@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntVecBuilder}; use obikseq::CanonicalKmer; use obiskio::{SKError, SKResult, UnitigFileReader}; -use obilayeredmap::{EvidenceKind, Layer, LayeredMap, MphfLayer, OLMError}; +use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError}; use obilayeredmap::meta::PartitionMeta; use crate::partition::KmerPartition; @@ -52,18 +52,17 @@ pub(crate) enum SrcLayerData { } impl SrcLayerData { - pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult { + pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult { let presence_dir = layer_dir.join("presence"); let counts_dir = layer_dir.join("counts"); - match mode { + match merge_mode { MergeMode::Presence => { if presence_dir.exists() { - let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Presence(mphf, mat)) } else if counts_dir.exists() { - // Source is a count index; treat count > 0 as present via ColBuilder::Bit. - let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Count(mphf, mat)) } else { @@ -72,7 +71,7 @@ impl SrcLayerData { } MergeMode::Count => { if counts_dir.exists() { - let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; + let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; Ok(SrcLayerData::Count(mphf, mat)) } else { @@ -116,7 +115,7 @@ fn load_meta(dir: &Path) -> SKResult { Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => { let mut n = 0usize; while dir.join(format!("layer_{n}")).exists() { n += 1; } - let m = PartitionMeta { n_layers: n }; + let m = PartitionMeta { n_layers: n, mode: IndexMode::default() }; m.save(dir).map_err(olm_to_sk)?; Ok(m) } @@ -217,12 +216,12 @@ impl KmerPartition { uw.write(&unitig)?; } uw.close()?; - Layer::<()>::build(&new_layer_dir, block_bits, &EvidenceKind::Exact).map_err(olm_to_sk)?; + Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?; } drop(g); let new_mphf = if any_new { - Some(MphfLayer::open(&new_layer_dir).map_err(olm_to_sk)?) + Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?) } else { None }; @@ -304,7 +303,7 @@ impl KmerPartition { for l in 0..src_meta.n_layers { let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?; - let src_data = SrcLayerData::open(&src_layer_dir, mode)?; + let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { let values = src_data.lookup(kmer, *src_n); diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs index 29ca5d5..86ee3f5 100644 --- a/src/obikpartitionner/src/rebuild_layer.rs +++ b/src/obikpartitionner/src/rebuild_layer.rs @@ -8,7 +8,7 @@ use obicompactvec::{PersistentBitMatrixBuilder, PersistentCompactIntVecBuilder}; use obidebruinj::GraphDeBruijn; use obiskio::{SKError, SKResult, UnitigFileReader}; -use obilayeredmap::{EvidenceKind, Layer, MphfLayer, OLMError}; +use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError}; use obilayeredmap::meta::PartitionMeta; use crate::filter::KmerFilter; diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index cca2a4f..63b99d2 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -10,7 +10,7 @@ use obikseq::CanonicalKmer; use obiskio::{UnitigFileReader, UnitigFileWriter}; use crate::error::{OLMError, OLMResult}; -use crate::meta::EvidenceKind; +use crate::meta::IndexMode; use crate::mphf_layer::MphfLayer; pub(crate) use crate::mphf_layer::UNITIGS_FILE; @@ -62,8 +62,8 @@ pub struct Hit { // ── Common read path ────────────────────────────────────────────────────────── impl Layer { - pub fn open(path: &Path) -> OLMResult { - let mphf = MphfLayer::open(path)?; + pub fn open(path: &Path, mode: &IndexMode) -> OLMResult { + let mphf = MphfLayer::open(path, mode)?; let data = D::open(path)?; Ok(Self { mphf, data }) } @@ -92,18 +92,13 @@ impl Layer { MphfLayer::build_approx_evidence(layer_dir, b, z) } - /// Dispatch to `build_exact_evidence` or `build_approx_evidence`. - /// `block_bits` is forwarded to exact evidence only. - pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult { - MphfLayer::build_evidence(layer_dir, kind, block_bits) - } } // ── Mode 1 — set membership ─────────────────────────────────────────────────── impl Layer<()> { - pub fn build(out_dir: &Path, block_bits: u8, evidence_kind: &EvidenceKind) -> OLMResult { - MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |_, _| Ok(())) + pub fn build(out_dir: &Path, block_bits: u8, mode: &IndexMode) -> OLMResult { + MphfLayer::build(out_dir, block_bits, mode, &mut |_, _| Ok(())) } /// Create a presence matrix for a set-membership layer (first merge). @@ -126,7 +121,7 @@ impl Layer { pub fn build( out_dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, count_of: impl Fn(CanonicalKmer) -> u32, ) -> OLMResult { let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers(); @@ -134,7 +129,7 @@ impl Layer { let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir) .map_err(OLMError::Io)?; let mut col = mb.add_col().map_err(OLMError::Io)?; - let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| { + let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| { col.set(slot, count_of(kmer)); Ok(()) })?; @@ -146,10 +141,10 @@ impl Layer { pub fn build_from_map( out_dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, counts: &HashMap, ) -> OLMResult { - Self::build(out_dir, block_bits, evidence_kind, |kmer| counts.get(&kmer).copied().unwrap_or(0)) + Self::build(out_dir, block_bits, mode, |kmer| counts.get(&kmer).copied().unwrap_or(0)) } } @@ -179,7 +174,7 @@ impl Layer { pub fn build_presence( out_dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, n_genomes: usize, present_in: impl Fn(CanonicalKmer, usize) -> bool, ) -> OLMResult { @@ -189,7 +184,7 @@ impl Layer { let mut cols: Vec<_> = (0..n_genomes) .map(|_| mb.add_col().map_err(OLMError::Io)) .collect::>()?; - let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| { + let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| { for (g, col) in cols.iter_mut().enumerate() { col.set(slot, present_in(kmer, g)); } diff --git a/src/obilayeredmap/src/lib.rs b/src/obilayeredmap/src/lib.rs index 98ca3c8..9b275a0 100644 --- a/src/obilayeredmap/src/lib.rs +++ b/src/obilayeredmap/src/lib.rs @@ -11,5 +11,5 @@ pub use error::{OLMError, OLMResult}; pub use layer::{Hit, Layer, LayerData}; pub use layered_store::LayeredStore; pub use map::LayeredMap; -pub use meta::{EvidenceKind, LayerMeta}; +pub use meta::{IndexMode, PartitionMeta}; pub use mphf_layer::MphfLayer; diff --git a/src/obilayeredmap/src/map.rs b/src/obilayeredmap/src/map.rs index 391ca99..18d3c55 100644 --- a/src/obilayeredmap/src/map.rs +++ b/src/obilayeredmap/src/map.rs @@ -5,11 +5,10 @@ use std::path::{Path, PathBuf}; use obicompactvec::PersistentCompactIntMatrix; use obikseq::CanonicalKmer; use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS}; -use crate::meta::EvidenceKind; use crate::error::OLMResult; use crate::layer::{Hit, Layer, LayerData}; -use crate::meta::PartitionMeta; +use crate::meta::{IndexMode, PartitionMeta}; /// Layered kmer index for a single partition. /// @@ -17,8 +16,8 @@ use crate::meta::PartitionMeta; /// the first match wins. Adding a dataset appends a new layer without /// rebuilding existing ones. pub struct LayeredMap { - root: PathBuf, - meta: PartitionMeta, + root: PathBuf, + meta: PartitionMeta, layers: Vec>, } @@ -26,39 +25,26 @@ pub struct LayeredMap { impl LayeredMap { /// Open an existing layered index at `root`. + /// The mode is read once from `PartitionMeta` and applied to all layers. pub fn open(root: &Path) -> OLMResult { let meta = PartitionMeta::load(root)?; let layers = (0..meta.n_layers) - .map(|i| Layer::::open(&layer_dir(root, i))) + .map(|i| Layer::::open(&layer_dir(root, i), &meta.mode)) .collect::>>()?; - Ok(Self { - root: root.to_owned(), - meta, - layers, - }) + Ok(Self { root: root.to_owned(), meta, layers }) } - /// Create a new, empty layered index at `root`. - pub fn create(root: &Path) -> OLMResult { + /// Create a new, empty layered index at `root` with the given mode. + pub fn create(root: &Path, mode: IndexMode) -> OLMResult { fs::create_dir_all(root)?; - let meta = PartitionMeta::new(); + let meta = PartitionMeta::new(mode); meta.save(root)?; - Ok(Self { - root: root.to_owned(), - meta, - layers: Vec::new(), - }) + Ok(Self { root: root.to_owned(), meta, layers: Vec::new() }) } - /// Return the number of layers in this index. - pub fn n_layers(&self) -> usize { - self.layers.len() - } - - /// Return a reference to the `i`-th layer. - pub fn layer(&self, i: usize) -> &Layer { - &self.layers[i] - } + pub fn n_layers(&self) -> usize { self.layers.len() } + pub fn layer(&self, i: usize) -> &Layer { &self.layers[i] } + pub fn mode(&self) -> &IndexMode { &self.meta.mode } /// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match. pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit)> { @@ -68,17 +54,15 @@ impl LayeredMap { .find_map(|(i, layer)| layer.query(kmer).map(|hit| (i, hit))) } - /// Return a `UnitigFileWriter` for the next layer to be built. pub fn next_layer_writer(&self) -> OLMResult { let dir = layer_dir(&self.root, self.layers.len()); Layer::::unitig_writer(&dir) } - /// Append a new layer to the index. fn append_layer(&mut self) -> OLMResult<()> { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - self.layers.push(Layer::::open(&dir)?); + self.layers.push(Layer::::open(&dir, &self.meta.mode)?); self.meta.n_layers = self.layers.len(); self.meta.save(&self.root)?; Ok(()) @@ -91,7 +75,7 @@ impl LayeredMap<()> { pub fn push_layer(&mut self) -> OLMResult { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact)?; + Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode)?; self.append_layer()?; Ok(i) } @@ -103,15 +87,12 @@ impl LayeredMap { pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult { let i = self.layers.len(); let dir = layer_dir(&self.root, i); - Layer::::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact, count_of)?; + Layer::::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode, count_of)?; self.append_layer()?; Ok(i) } - pub fn push_layer_from_map( - &mut self, - counts: &HashMap, - ) -> OLMResult { + pub fn push_layer_from_map(&mut self, counts: &HashMap) -> OLMResult { self.push_layer(|kmer| counts.get(&kmer).copied().unwrap_or(0)) } } diff --git a/src/obilayeredmap/src/meta.rs b/src/obilayeredmap/src/meta.rs index 352cbdd..ed567d3 100644 --- a/src/obilayeredmap/src/meta.rs +++ b/src/obilayeredmap/src/meta.rs @@ -5,65 +5,45 @@ use serde::{Deserialize, Serialize}; use crate::error::OLMResult; -const META_FILE: &str = "meta.json"; -const LAYER_META_FILE: &str = "layer_meta.json"; +const META_FILE: &str = "meta.json"; -// ── Layer-level metadata ────────────────────────────────────────────────────── +// ── IndexMode ───────────────────────────────────────────────────────────────── -/// Describes the evidence bundle stored alongside the MPHF for one layer. +/// Evidence mode for an entire partitioned index — homogeneous across all layers. +/// +/// Determined once at build time; stored in `PartitionMeta` (`meta.json`). +/// All layers within an index share the same mode. #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(tag = "type", rename_all = "snake_case")] -pub enum EvidenceKind { +pub enum IndexMode { /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives. Exact, /// Approximate evidence: `fingerprint.bin` only. - /// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b. - /// `z` — consecutive k-mers that must all match (Findere trick); - /// effective FP rate per sequencing read ≈ W / 2^(b·z) - /// where W = L - k - z + 2 is the number of windows in a read of length L. + /// `b` — fingerprint bits per slot; false-positive rate ≈ 1/2^b per query. + /// `z` — Findere consecutive-kmer parameter (build-time only; not used at query time). Approx { b: u8, z: u8 }, + /// Hybrid: both `fingerprint.bin` and `evidence.bin` + `unitigs.bin.idx`. + /// `find()` uses the fingerprint (O(1), approx); `find_strict()` uses exact evidence. + Hybrid { b: u8, z: u8 }, } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LayerMeta { - pub evidence: EvidenceKind, -} - -impl Default for EvidenceKind { +impl Default for IndexMode { fn default() -> Self { Self::Exact } } -impl LayerMeta { - pub fn exact() -> Self { - Self { evidence: EvidenceKind::Exact } - } - - pub fn approx(b: u8, z: u8) -> Self { - Self { evidence: EvidenceKind::Approx { b, z } } - } - - pub fn load(layer_dir: &Path) -> OLMResult { - let f = File::open(layer_dir.join(LAYER_META_FILE))?; - Ok(serde_json::from_reader(f)?) - } - - pub fn save(&self, layer_dir: &Path) -> OLMResult<()> { - let f = File::create(layer_dir.join(LAYER_META_FILE))?; - serde_json::to_writer_pretty(f, self)?; - Ok(()) - } -} - -// ── Partition-level metadata ────────────────────────────────────────────────── +// ── PartitionMeta ───────────────────────────────────────────────────────────── +/// Index-level metadata stored in `meta.json` at the root of a partition index. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PartitionMeta { pub n_layers: usize, + #[serde(default)] + pub mode: IndexMode, } impl PartitionMeta { - pub fn new() -> Self { - Self { n_layers: 0 } + pub fn new(mode: IndexMode) -> Self { + Self { n_layers: 0, mode } } pub fn load(dir: &Path) -> OLMResult { @@ -79,5 +59,5 @@ impl PartitionMeta { } impl Default for PartitionMeta { - fn default() -> Self { Self::new() } + fn default() -> Self { Self::new(IndexMode::Exact) } } diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index c94ce69..877e564 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -1,5 +1,5 @@ use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; use cacheline_ef::{CachelineEf, CachelineEfVec}; use epserde::prelude::*; @@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; use crate::error::{OLMError, OLMResult}; use crate::evidence::{Evidence, EvidenceWriter}; use crate::fingerprint::{FingerprintVec, FingerprintVecWriter}; -use crate::meta::{EvidenceKind, LayerMeta}; +use crate::meta::IndexMode; pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; @@ -19,19 +19,22 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin"; pub(crate) type Mphf = PtrHash>, Xx64, Vec>; -// ── Evidence store ──────────────────────────────────────────────────────────── +// ── LayerEvidence ───────────────────────────────────────────────────────────── enum LayerEvidence { - Exact { evidence: Evidence, unitigs: UnitigFileReader }, - Approx { fingerprint: FingerprintVec }, + Exact { evidence: Evidence, unitigs: UnitigFileReader }, + Approx { fingerprint: FingerprintVec, unitigs_path: PathBuf }, + Hybrid { evidence: Evidence, unitigs: UnitigFileReader, fingerprint: FingerprintVec }, } // ── MphfLayer ───────────────────────────────────────────────────────────────── /// Autonomous kmer → slot mapping for one layer. /// -/// Dispatches queries to exact or approximate evidence transparently based on -/// the `layer_meta.json` written at build time. +/// Two query methods: +/// - [`find`](Self::find) — O(1), uses fingerprint (Approx/Hybrid) or exact evidence (Exact). +/// - [`find_strict`](Self::find_strict) — always exact; O(1) on Exact/Hybrid layers, +/// O(n) sequential scan on Approx layers. pub struct MphfLayer { mphf: Mphf, ev: LayerEvidence, @@ -39,21 +42,31 @@ pub struct MphfLayer { } impl MphfLayer { - pub fn open(dir: &Path) -> OLMResult { - let meta = LayerMeta::load(dir)?; + /// Open a layer using the index-level `mode` determined at `LayeredMap` open time. + /// No per-layer metadata file is read. + pub fn open(dir: &Path, mode: &IndexMode) -> OLMResult { let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE)) .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - let (ev, n) = match meta.evidence { - EvidenceKind::Exact => { + let (ev, n) = match mode { + IndexMode::Exact => { let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?; let n = evidence.len(); + // open() auto-detects: uses direct access since exact layers always have .idx let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; (LayerEvidence::Exact { evidence, unitigs }, n) } - EvidenceKind::Approx { .. } => { + IndexMode::Approx { .. } => { let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?; let n = fingerprint.n(); - (LayerEvidence::Approx { fingerprint }, n) + let unitigs_path = dir.join(UNITIGS_FILE); + (LayerEvidence::Approx { fingerprint, unitigs_path }, n) + } + IndexMode::Hybrid { .. } => { + let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?; + let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?; + let n = evidence.len(); + let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; + (LayerEvidence::Hybrid { evidence, unitigs, fingerprint }, n) } }; Ok(Self { mphf, ev, n }) @@ -61,45 +74,60 @@ impl MphfLayer { // ── Query API ───────────────────────────────────────────────────────────── - /// Transparent dispatch: routes to `find_exact` or `find_approx` based on - /// the evidence loaded at `open` time. + /// O(1) lookup — dispatches automatically: + /// - Exact: evidence + `verify_canonical_kmer`, zero false positives. + /// - Approx: fingerprint check, false-positive rate ≈ 1/2^b. + /// - Hybrid: fingerprint check (fast path), zero false positives via `find_strict`. #[inline] pub fn find(&self, kmer: CanonicalKmer) -> Option { + let slot = self.mphf.index(&kmer.raw()); + if slot >= self.n { return None; } match &self.ev { - LayerEvidence::Exact { .. } => self.find_exact(kmer), - LayerEvidence::Approx { .. } => self.find_approx(kmer), + LayerEvidence::Exact { evidence, unitigs } => { + let (chunk_id, rank) = evidence.decode(slot); + if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { + Some(slot) + } else { + None + } + } + LayerEvidence::Approx { fingerprint, .. } | + LayerEvidence::Hybrid { fingerprint, .. } => { + if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None } + } } } - /// Exact lookup: zero false positives. Panics if the layer was opened with - /// approximate evidence. - #[inline] - pub fn find_exact(&self, kmer: CanonicalKmer) -> Option { - let LayerEvidence::Exact { evidence, unitigs } = &self.ev else { - panic!("find_exact called on an approximate layer"); - }; + /// Always-exact lookup — zero false positives regardless of mode. + /// + /// - Exact/Hybrid: O(1) via evidence + `verify_canonical_kmer`. + /// - Approx: O(n) sequential scan of `unitigs.bin` to confirm the kmer + /// that owns the slot, then exact comparison. + pub fn find_strict(&self, kmer: CanonicalKmer) -> Option { let slot = self.mphf.index(&kmer.raw()); if slot >= self.n { return None; } - let (chunk_id, rank) = evidence.decode(slot); - if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { - Some(slot) - } else { - None + match &self.ev { + LayerEvidence::Exact { evidence, unitigs } | + LayerEvidence::Hybrid { evidence, unitigs, .. } => { + let (chunk_id, rank) = evidence.decode(slot); + if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { + Some(slot) + } else { + None + } + } + LayerEvidence::Approx { unitigs_path, .. } => { + let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?; + for stored in reader.iter_canonical_kmers() { + if self.mphf.index(&stored.raw()) == slot { + return if stored == kmer { Some(slot) } else { None }; + } + } + None + } } } - /// Approximate lookup: false-positive rate 1/2^b per k-mer query. Panics - /// if the layer was opened with exact evidence. - #[inline] - pub fn find_approx(&self, kmer: CanonicalKmer) -> Option { - let LayerEvidence::Approx { fingerprint } = &self.ev else { - panic!("find_approx called on an exact layer"); - }; - let slot = self.mphf.index(&kmer.raw()); - if slot >= self.n { return None; } - if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None } - } - pub fn n(&self) -> usize { self.n } // ── Build helpers ───────────────────────────────────────────────────────── @@ -109,19 +137,7 @@ impl MphfLayer { Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?) } - /// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on - /// `kind`. `block_bits` is forwarded to exact evidence only. - pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult { - match kind { - EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits), - EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z), - } - } - /// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`. - /// - /// `block_bits` controls the `.idx` block size (2^block_bits chunks per block). - /// Uses sequential iteration — no `.idx` required on entry. pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult { let unitig_path = dir.join(UNITIGS_FILE); let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; @@ -130,7 +146,6 @@ impl MphfLayer { if n == 0 { fs::File::create(dir.join(EVIDENCE_FILE))?; build_unitig_idx(&unitig_path, block_bits)?; - LayerMeta::exact().save(dir)?; return Ok(0); } @@ -156,13 +171,10 @@ impl MphfLayer { ev.write(&dir.join(EVIDENCE_FILE))?; build_unitig_idx(&unitig_path, block_bits)?; - LayerMeta::exact().save(dir)?; Ok(n) } /// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`. - /// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer - /// parameter (≥1). No `.idx` is written. pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult { if b == 0 || b > 64 { return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into())); @@ -176,7 +188,6 @@ impl MphfLayer { if n == 0 { FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?; - LayerMeta::approx(b, z).save(dir)?; return Ok(0); } @@ -194,139 +205,113 @@ impl MphfLayer { } fw.write(&dir.join(FINGERPRINT_FILE))?; - LayerMeta::approx(b, z).save(dir)?; Ok(n) } - /// Build MPHF then evidence from the unitigs file already present in `dir`. + /// Build MPHF + evidence from `unitigs.bin` already present in `dir`. /// - /// - Exact: `.idx` is built for pass-1 parallel construction and kept for - /// query-time kmer verification. `evidence.bin` is written. - /// - Approx: pass-1 uses `open_sequential` + `par_bridge` — no `.idx` is - /// ever created. `fingerprint.bin` is written. - /// - /// `fill_slot(slot, kmer)` is called once per kmer in both modes. + /// `fill_slot(slot, kmer)` is called once per kmer in all modes. + /// No `layer_meta.json` is written — the mode is an index-level property + /// stored in `PartitionMeta`. pub(crate) fn build( dir: &Path, block_bits: u8, - evidence_kind: &EvidenceKind, + mode: &IndexMode, fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>, ) -> OLMResult { use rayon::prelude::*; let unitig_path = dir.join(UNITIGS_FILE); + let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers(); - match evidence_kind { - // ── Exact path ──────────────────────────────────────────────────── - // .idx is built LAST, once evidence.bin is written, so it is never - // present during construction — only at query time. - EvidenceKind::Exact => { - let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers(); - let keys = CanonicalKmerIter::new(&unitig_path) - .map_err(|e| match e { - obiskio::SKError::Io(io) => OLMError::Io(io), - e => OLMError::InvalidLayer(e.to_string()), - })?; + let sk_to_olm = |e: obiskio::SKError| match e { + obiskio::SKError::Io(io) => OLMError::Io(io), + e => OLMError::InvalidLayer(e.to_string()), + }; - if n == 0 { + // ── Empty layer ─────────────────────────────────────────────────────── + if n == 0 { + let mphf: Mphf = + Mphf::try_new(&[] as &[u64], PtrHashParams::::default()) + .ok_or_else(|| OLMError::Mphf("construction failed".into()))?; + mphf.store(&dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + match mode { + IndexMode::Exact | IndexMode::Hybrid { .. } => { fs::File::create(dir.join(EVIDENCE_FILE))?; - let mphf: Mphf = - Mphf::try_new(&[] as &[u64], PtrHashParams::::default()) - .ok_or_else(|| OLMError::Mphf("construction failed".into()))?; - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - LayerMeta::exact().save(dir)?; build_unitig_idx(&unitig_path, block_bits)?; - return Ok(0); } + IndexMode::Approx { b, .. } => { + FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?; + } + } + if let IndexMode::Hybrid { b, .. } = mode { + FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?; + } + return Ok(0); + } - // Pass 1 — MPHF construction via clonable mmap iterator - let mphf: Mphf = - Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::::default()); - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; + // ── Pass 1: MPHF via clonable mmap iterator ─────────────────────────── + let keys = CanonicalKmerIter::new(&unitig_path).map_err(sk_to_olm)?; + let mphf: Mphf = + Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), + PtrHashParams::::default()); + mphf.store(&dir.join(MPHF_FILE)) + .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - // Pass 2 — sequential: fill evidence.bin + callback - let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?; - let mut ev = EvidenceWriter::new(n); - let mut seen = vec![0u8; (n + 7) / 8]; + // ── Pass 2: fill evidence files + callback ──────────────────────────── + let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?; + let mut seen = vec![0u8; (n + 7) / 8]; + match mode { + IndexMode::Exact => { + let mut ev = EvidenceWriter::new(n); for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() { let slot = mphf.index(&kmer.raw()); - if slot >= n { - return Err(OLMError::Mphf("slot out of bounds".into())); - } - let byte = slot / 8; - let bit = 1u8 << (slot % 8); - if seen[byte] & bit != 0 { - return Err(OLMError::Mphf("duplicate slot".into())); - } + if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); } + let byte = slot / 8; let bit = 1u8 << (slot % 8); + if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); } seen[byte] |= bit; ev.set(slot, chunk_id as u32, rank as u8); fill_slot(slot, kmer)?; } - ev.write(&dir.join(EVIDENCE_FILE))?; - LayerMeta::exact().save(dir)?; - // .idx built last: strictly for query-time kmer verification build_unitig_idx(&unitig_path, block_bits)?; - Ok(n) } - // ── Approx path ─────────────────────────────────────────────────── - // No .idx is created at any point. - EvidenceKind::Approx { b, z } => { - let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; - let n = unitigs.n_kmers(); - - if n == 0 { - FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?; - let mphf: Mphf = - Mphf::try_new(&[] as &[u64], PtrHashParams::::default()) - .ok_or_else(|| OLMError::Mphf("construction failed".into()))?; - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - LayerMeta::approx(*b, *z).save(dir)?; - return Ok(0); - } - - // Pass 1 — MPHF construction via mmap-backed clonable iterator. - // No .idx is created. par_bridge() parallelises the sequential scan; - // Clone on CanonicalKmerRawIter shares the Arc and resets to pos 0. - let keys = CanonicalKmerIter::new(&unitig_path) - .map_err(|e| match e { - obiskio::SKError::Io(io) => OLMError::Io(io), - e => OLMError::InvalidLayer(e.to_string()), - })?; - let mphf: Mphf = - Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::::default()); - mphf.store(&dir.join(MPHF_FILE)) - .map_err(|e| OLMError::InvalidLayer(e.to_string()))?; - - // Pass 2 — sequential: fill fingerprint.bin + callback - let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?; - let mut fw = FingerprintVecWriter::new(n, *b); - let mut seen = vec![0u8; (n + 7) / 8]; - + IndexMode::Approx { b, .. } => { + let mut fw = FingerprintVecWriter::new(n, *b); for kmer in unitigs2.iter_canonical_kmers() { let slot = mphf.index(&kmer.raw()); - if slot >= n { - return Err(OLMError::Mphf("slot out of bounds".into())); - } - let byte = slot / 8; - let bit = 1u8 << (slot % 8); - if seen[byte] & bit != 0 { - return Err(OLMError::Mphf("duplicate slot".into())); - } + if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); } + let byte = slot / 8; let bit = 1u8 << (slot % 8); + if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); } seen[byte] |= bit; fw.set(slot, kmer.seq_hash()); fill_slot(slot, kmer)?; } - fw.write(&dir.join(FINGERPRINT_FILE))?; - LayerMeta::approx(*b, *z).save(dir)?; - Ok(n) + } + + IndexMode::Hybrid { b, .. } => { + let mut ev = EvidenceWriter::new(n); + let mut fw = FingerprintVecWriter::new(n, *b); + for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() { + let slot = mphf.index(&kmer.raw()); + if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); } + let byte = slot / 8; let bit = 1u8 << (slot % 8); + if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); } + seen[byte] |= bit; + ev.set(slot, chunk_id as u32, rank as u8); + fw.set(slot, kmer.seq_hash()); + fill_slot(slot, kmer)?; + } + ev.write(&dir.join(EVIDENCE_FILE))?; + fw.write(&dir.join(FINGERPRINT_FILE))?; + build_unitig_idx(&unitig_path, block_bits)?; } } + + Ok(n) } } diff --git a/src/obiskio/src/unitig_index.rs b/src/obiskio/src/unitig_index.rs index f5bf310..fd46892 100644 --- a/src/obiskio/src/unitig_index.rs +++ b/src/obiskio/src/unitig_index.rs @@ -198,11 +198,16 @@ pub fn build_unitig_idx(unitigs_path: &Path, block_bits: u8) -> SKResult<()> { // ── Reader ──────────────────────────────────────────────────────────────────── -/// Read-only random-access view of a unitig file. +/// Memory-mapped view of a unitig file, with optional direct-access index. /// -/// The sequence file is memory-mapped; the block offset table is loaded into RAM -/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap -/// reads. Sequential iteration: O(n) via a running-offset cursor. +/// Three constructors select the operating mode: +/// - [`open`](Self::open) — smart default: direct access if `.idx` exists, sequential otherwise. +/// - [`open_sequential`](Self::open_sequential) — always sequential, ignores `.idx`. +/// - [`open_direct_access`](Self::open_direct_access) — requires `.idx`, errors if absent. +/// +/// All positional methods (`chunk_start`, `verify_canonical_kmer`, …) work in +/// both modes. Without `.idx` they fall back to an O(i) sequential scan — +/// correct but slower. pub struct UnitigFileReader { mmap: Mmap, block_offsets: Vec, @@ -214,8 +219,52 @@ pub struct UnitigFileReader { } impl UnitigFileReader { - /// Open with `.idx` — enables both sequential iteration and random access. + /// Smart default: opens with direct access if `.idx` is present, sequential otherwise. pub fn open(path: &Path) -> SKResult { + if idx_path(path).exists() { + Self::open_direct_access(path) + } else { + Self::open_sequential(path) + } + } + + /// Always sequential — never reads `.idx` even if present. + /// + /// Scans the binary file once to count chunks and k-mers. + /// Positional access (`chunk_start`, `verify_canonical_kmer`) falls back to + /// O(i) sequential scan. + pub fn open_sequential(path: &Path) -> SKResult { + let file = File::open(path).map_err(SKError::Io)?; + let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; + let k = obikseq::params::k(); + + let mut offset = 0usize; + let mut n_unitigs = 0usize; + let mut n_kmers = 0usize; + while offset < mmap.len() { + let seql_minus_k = mmap[offset] as usize; + n_kmers += seql_minus_k + 1; + offset += 1 + (seql_minus_k + k + 3) / 4; + n_unitigs += 1; + } + + Ok(Self { + mmap, + block_offsets: Vec::new(), + n_unitigs, + n_kmers, + k, + block_bits: DEFAULT_BLOCK_BITS, + mask: (1usize << DEFAULT_BLOCK_BITS) - 1, + }) + } + + /// Requires `.idx` — errors if the companion index file is absent. + /// + /// Enables O(1 << block_bits) positional access to any chunk. + /// Use only when direct access is architecturally required (query-time + /// verification on an exact-evidence layer). + pub fn open_direct_access(path: &Path) -> SKResult { let file = File::open(path).map_err(SKError::Io)?; let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?; @@ -231,58 +280,38 @@ impl UnitigFileReader { }) } - /// Open without `.idx` — sequential iteration only, no random access. - /// - /// Scans the binary file once to count chunks and k-mers. Use when only - /// [`Self::iter_kmers`], [`Self::iter_canonical_kmers`], or - /// [`Self::iter_indexed_canonical_kmers`] are needed. - pub fn open_sequential(path: &Path) -> SKResult { - let file = File::open(path).map_err(SKError::Io)?; - let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; - let k = obikseq::params::k(); - - let mut offset = 0usize; - let mut n_unitigs = 0usize; - let mut n_kmers = 0usize; - while offset < mmap.len() { - let seql_minus_k = mmap[offset] as usize; - n_kmers += seql_minus_k + 1; - offset += 1 + (seql_minus_k + k + 3) / 4; - n_unitigs += 1; - } - - Ok(Self { - mmap, - block_offsets: Vec::new(), // empty → random access disabled - n_unitigs, - n_kmers, - k, - block_bits: DEFAULT_BLOCK_BITS, - mask: (1usize << DEFAULT_BLOCK_BITS) - 1, - }) - } - pub fn len(&self) -> usize { self.n_unitigs } pub fn is_empty(&self) -> bool { self.n_unitigs == 0 } pub fn n_kmers(&self) -> usize { self.n_kmers } pub fn block_bits(&self) -> u8 { self.block_bits } + pub fn has_direct_access(&self) -> bool { !self.block_offsets.is_empty() } - /// Byte offset of the START of record `i` (the seql byte) in the mmap. + /// Byte offset of record `i` in the mmap. + /// + /// Fast path (O(1 << block_bits)) when `.idx` is loaded; degraded O(i) + /// sequential scan otherwise. #[inline] fn chunk_start(&self, i: usize) -> usize { - assert!(!self.block_offsets.is_empty(), - "random access requires UnitigFileReader::open(); use open_sequential() for iteration only"); - if self.block_bits == 0 { - return self.block_offsets[i] as usize; + if !self.block_offsets.is_empty() { + if self.block_bits == 0 { + return self.block_offsets[i] as usize; + } + let block = i >> self.block_bits; + let rem = i & self.mask; + let mut offset = self.block_offsets[block] as usize; + for _ in 0..rem { + let seql_minus_k = self.mmap[offset] as usize; + offset += 1 + (seql_minus_k + self.k + 3) / 4; + } + offset + } else { + let mut offset = 0usize; + for _ in 0..i { + let seql_minus_k = self.mmap[offset] as usize; + offset += 1 + (seql_minus_k + self.k + 3) / 4; + } + offset } - let block = i >> self.block_bits; - let rem = i & self.mask; - let mut offset = self.block_offsets[block] as usize; - for _ in 0..rem { - let seql_minus_k = self.mmap[offset] as usize; - offset += 1 + (seql_minus_k + self.k + 3) / 4; - } - offset } /// Nucleotide length of chunk `i`. @@ -307,7 +336,9 @@ impl UnitigFileReader { extract_kmer_raw(&self.mmap[offset + 1..], j, self.k) } - /// `true` iff the k-mer at position `j` of chunk `i` equals `query` (canonical). + /// `true` iff the k-mer at position `j` of chunk `i` matches `query`. + /// + /// Works in both modes; O(i) scan when `.idx` is absent. #[inline] pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool { canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()