refactor: switch indexing to IndexMode and update metadata
Replace EvidenceKind with IndexMode (Exact, Approx, Hybrid) across layer construction and query dispatch. Update PartitionMeta and LayerMeta serialization to centralize index-wide configuration. Add flexible push_layer overloads to LayeredMap for dynamic index expansion without full rebuilds. Improve UnitigFileReader to gracefully fall back to sequential scanning when indexes are missing, eliminating panics.
This commit is contained in:
@@ -10,7 +10,7 @@ use obikseq::CanonicalKmer;
|
||||
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
use crate::meta::EvidenceKind;
|
||||
use crate::meta::IndexMode;
|
||||
use crate::mphf_layer::MphfLayer;
|
||||
pub(crate) use crate::mphf_layer::UNITIGS_FILE;
|
||||
|
||||
@@ -62,8 +62,8 @@ pub struct Hit<T = ()> {
|
||||
// ── Common read path ──────────────────────────────────────────────────────────
|
||||
|
||||
impl<D: LayerData> Layer<D> {
|
||||
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||
let mphf = MphfLayer::open(path)?;
|
||||
pub fn open(path: &Path, mode: &IndexMode) -> OLMResult<Self> {
|
||||
let mphf = MphfLayer::open(path, mode)?;
|
||||
let data = D::open(path)?;
|
||||
Ok(Self { mphf, data })
|
||||
}
|
||||
@@ -92,18 +92,13 @@ impl<D: LayerData> Layer<D> {
|
||||
MphfLayer::build_approx_evidence(layer_dir, b, z)
|
||||
}
|
||||
|
||||
/// Dispatch to `build_exact_evidence` or `build_approx_evidence`.
|
||||
/// `block_bits` is forwarded to exact evidence only.
|
||||
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
|
||||
MphfLayer::build_evidence(layer_dir, kind, block_bits)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
||||
|
||||
impl Layer<()> {
|
||||
pub fn build(out_dir: &Path, block_bits: u8, evidence_kind: &EvidenceKind) -> OLMResult<usize> {
|
||||
MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |_, _| Ok(()))
|
||||
pub fn build(out_dir: &Path, block_bits: u8, mode: &IndexMode) -> OLMResult<usize> {
|
||||
MphfLayer::build(out_dir, block_bits, mode, &mut |_, _| Ok(()))
|
||||
}
|
||||
|
||||
/// Create a presence matrix for a set-membership layer (first merge).
|
||||
@@ -126,7 +121,7 @@ impl Layer<PersistentCompactIntMatrix> {
|
||||
pub fn build(
|
||||
out_dir: &Path,
|
||||
block_bits: u8,
|
||||
evidence_kind: &EvidenceKind,
|
||||
mode: &IndexMode,
|
||||
count_of: impl Fn(CanonicalKmer) -> u32,
|
||||
) -> OLMResult<usize> {
|
||||
let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers();
|
||||
@@ -134,7 +129,7 @@ impl Layer<PersistentCompactIntMatrix> {
|
||||
let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir)
|
||||
.map_err(OLMError::Io)?;
|
||||
let mut col = mb.add_col().map_err(OLMError::Io)?;
|
||||
let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| {
|
||||
let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| {
|
||||
col.set(slot, count_of(kmer));
|
||||
Ok(())
|
||||
})?;
|
||||
@@ -146,10 +141,10 @@ impl Layer<PersistentCompactIntMatrix> {
|
||||
pub fn build_from_map(
|
||||
out_dir: &Path,
|
||||
block_bits: u8,
|
||||
evidence_kind: &EvidenceKind,
|
||||
mode: &IndexMode,
|
||||
counts: &HashMap<CanonicalKmer, u32>,
|
||||
) -> OLMResult<usize> {
|
||||
Self::build(out_dir, block_bits, evidence_kind, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||
Self::build(out_dir, block_bits, mode, |kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,7 +174,7 @@ impl Layer<PersistentBitMatrix> {
|
||||
pub fn build_presence(
|
||||
out_dir: &Path,
|
||||
block_bits: u8,
|
||||
evidence_kind: &EvidenceKind,
|
||||
mode: &IndexMode,
|
||||
n_genomes: usize,
|
||||
present_in: impl Fn(CanonicalKmer, usize) -> bool,
|
||||
) -> OLMResult<usize> {
|
||||
@@ -189,7 +184,7 @@ impl Layer<PersistentBitMatrix> {
|
||||
let mut cols: Vec<_> = (0..n_genomes)
|
||||
.map(|_| mb.add_col().map_err(OLMError::Io))
|
||||
.collect::<OLMResult<_>>()?;
|
||||
let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| {
|
||||
let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| {
|
||||
for (g, col) in cols.iter_mut().enumerate() {
|
||||
col.set(slot, present_in(kmer, g));
|
||||
}
|
||||
|
||||
@@ -11,5 +11,5 @@ pub use error::{OLMError, OLMResult};
|
||||
pub use layer::{Hit, Layer, LayerData};
|
||||
pub use layered_store::LayeredStore;
|
||||
pub use map::LayeredMap;
|
||||
pub use meta::{EvidenceKind, LayerMeta};
|
||||
pub use meta::{IndexMode, PartitionMeta};
|
||||
pub use mphf_layer::MphfLayer;
|
||||
|
||||
@@ -5,11 +5,10 @@ use std::path::{Path, PathBuf};
|
||||
use obicompactvec::PersistentCompactIntMatrix;
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS};
|
||||
use crate::meta::EvidenceKind;
|
||||
|
||||
use crate::error::OLMResult;
|
||||
use crate::layer::{Hit, Layer, LayerData};
|
||||
use crate::meta::PartitionMeta;
|
||||
use crate::meta::{IndexMode, PartitionMeta};
|
||||
|
||||
/// Layered kmer index for a single partition.
|
||||
///
|
||||
@@ -17,8 +16,8 @@ use crate::meta::PartitionMeta;
|
||||
/// the first match wins. Adding a dataset appends a new layer without
|
||||
/// rebuilding existing ones.
|
||||
pub struct LayeredMap<D: LayerData = ()> {
|
||||
root: PathBuf,
|
||||
meta: PartitionMeta,
|
||||
root: PathBuf,
|
||||
meta: PartitionMeta,
|
||||
layers: Vec<Layer<D>>,
|
||||
}
|
||||
|
||||
@@ -26,39 +25,26 @@ pub struct LayeredMap<D: LayerData = ()> {
|
||||
|
||||
impl<D: LayerData> LayeredMap<D> {
|
||||
/// Open an existing layered index at `root`.
|
||||
/// The mode is read once from `PartitionMeta` and applied to all layers.
|
||||
pub fn open(root: &Path) -> OLMResult<Self> {
|
||||
let meta = PartitionMeta::load(root)?;
|
||||
let layers = (0..meta.n_layers)
|
||||
.map(|i| Layer::<D>::open(&layer_dir(root, i)))
|
||||
.map(|i| Layer::<D>::open(&layer_dir(root, i), &meta.mode))
|
||||
.collect::<OLMResult<Vec<_>>>()?;
|
||||
Ok(Self {
|
||||
root: root.to_owned(),
|
||||
meta,
|
||||
layers,
|
||||
})
|
||||
Ok(Self { root: root.to_owned(), meta, layers })
|
||||
}
|
||||
|
||||
/// Create a new, empty layered index at `root`.
|
||||
pub fn create(root: &Path) -> OLMResult<Self> {
|
||||
/// Create a new, empty layered index at `root` with the given mode.
|
||||
pub fn create(root: &Path, mode: IndexMode) -> OLMResult<Self> {
|
||||
fs::create_dir_all(root)?;
|
||||
let meta = PartitionMeta::new();
|
||||
let meta = PartitionMeta::new(mode);
|
||||
meta.save(root)?;
|
||||
Ok(Self {
|
||||
root: root.to_owned(),
|
||||
meta,
|
||||
layers: Vec::new(),
|
||||
})
|
||||
Ok(Self { root: root.to_owned(), meta, layers: Vec::new() })
|
||||
}
|
||||
|
||||
/// Return the number of layers in this index.
|
||||
pub fn n_layers(&self) -> usize {
|
||||
self.layers.len()
|
||||
}
|
||||
|
||||
/// Return a reference to the `i`-th layer.
|
||||
pub fn layer(&self, i: usize) -> &Layer<D> {
|
||||
&self.layers[i]
|
||||
}
|
||||
pub fn n_layers(&self) -> usize { self.layers.len() }
|
||||
pub fn layer(&self, i: usize) -> &Layer<D> { &self.layers[i] }
|
||||
pub fn mode(&self) -> &IndexMode { &self.meta.mode }
|
||||
|
||||
/// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match.
|
||||
pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit<D::Item>)> {
|
||||
@@ -68,17 +54,15 @@ impl<D: LayerData> LayeredMap<D> {
|
||||
.find_map(|(i, layer)| layer.query(kmer).map(|hit| (i, hit)))
|
||||
}
|
||||
|
||||
/// Return a `UnitigFileWriter` for the next layer to be built.
|
||||
pub fn next_layer_writer(&self) -> OLMResult<UnitigFileWriter> {
|
||||
let dir = layer_dir(&self.root, self.layers.len());
|
||||
Layer::<D>::unitig_writer(&dir)
|
||||
}
|
||||
|
||||
/// Append a new layer to the index.
|
||||
fn append_layer(&mut self) -> OLMResult<()> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
self.layers.push(Layer::<D>::open(&dir)?);
|
||||
self.layers.push(Layer::<D>::open(&dir, &self.meta.mode)?);
|
||||
self.meta.n_layers = self.layers.len();
|
||||
self.meta.save(&self.root)?;
|
||||
Ok(())
|
||||
@@ -91,7 +75,7 @@ impl LayeredMap<()> {
|
||||
pub fn push_layer(&mut self) -> OLMResult<usize> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact)?;
|
||||
Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode)?;
|
||||
self.append_layer()?;
|
||||
Ok(i)
|
||||
}
|
||||
@@ -103,15 +87,12 @@ impl LayeredMap<PersistentCompactIntMatrix> {
|
||||
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
|
||||
let i = self.layers.len();
|
||||
let dir = layer_dir(&self.root, i);
|
||||
Layer::<PersistentCompactIntMatrix>::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact, count_of)?;
|
||||
Layer::<PersistentCompactIntMatrix>::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode, count_of)?;
|
||||
self.append_layer()?;
|
||||
Ok(i)
|
||||
}
|
||||
|
||||
pub fn push_layer_from_map(
|
||||
&mut self,
|
||||
counts: &HashMap<CanonicalKmer, u32>,
|
||||
) -> OLMResult<usize> {
|
||||
pub fn push_layer_from_map(&mut self, counts: &HashMap<CanonicalKmer, u32>) -> OLMResult<usize> {
|
||||
self.push_layer(|kmer| counts.get(&kmer).copied().unwrap_or(0))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,65 +5,45 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::error::OLMResult;
|
||||
|
||||
const META_FILE: &str = "meta.json";
|
||||
const LAYER_META_FILE: &str = "layer_meta.json";
|
||||
const META_FILE: &str = "meta.json";
|
||||
|
||||
// ── Layer-level metadata ──────────────────────────────────────────────────────
|
||||
// ── IndexMode ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Describes the evidence bundle stored alongside the MPHF for one layer.
|
||||
/// Evidence mode for an entire partitioned index — homogeneous across all layers.
|
||||
///
|
||||
/// Determined once at build time; stored in `PartitionMeta` (`meta.json`).
|
||||
/// All layers within an index share the same mode.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum EvidenceKind {
|
||||
pub enum IndexMode {
|
||||
/// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
|
||||
Exact,
|
||||
/// Approximate evidence: `fingerprint.bin` only.
|
||||
/// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b.
|
||||
/// `z` — consecutive k-mers that must all match (Findere trick);
|
||||
/// effective FP rate per sequencing read ≈ W / 2^(b·z)
|
||||
/// where W = L - k - z + 2 is the number of windows in a read of length L.
|
||||
/// `b` — fingerprint bits per slot; false-positive rate ≈ 1/2^b per query.
|
||||
/// `z` — Findere consecutive-kmer parameter (build-time only; not used at query time).
|
||||
Approx { b: u8, z: u8 },
|
||||
/// Hybrid: both `fingerprint.bin` and `evidence.bin` + `unitigs.bin.idx`.
|
||||
/// `find()` uses the fingerprint (O(1), approx); `find_strict()` uses exact evidence.
|
||||
Hybrid { b: u8, z: u8 },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct LayerMeta {
|
||||
pub evidence: EvidenceKind,
|
||||
}
|
||||
|
||||
impl Default for EvidenceKind {
|
||||
impl Default for IndexMode {
|
||||
fn default() -> Self { Self::Exact }
|
||||
}
|
||||
|
||||
impl LayerMeta {
|
||||
pub fn exact() -> Self {
|
||||
Self { evidence: EvidenceKind::Exact }
|
||||
}
|
||||
|
||||
pub fn approx(b: u8, z: u8) -> Self {
|
||||
Self { evidence: EvidenceKind::Approx { b, z } }
|
||||
}
|
||||
|
||||
pub fn load(layer_dir: &Path) -> OLMResult<Self> {
|
||||
let f = File::open(layer_dir.join(LAYER_META_FILE))?;
|
||||
Ok(serde_json::from_reader(f)?)
|
||||
}
|
||||
|
||||
pub fn save(&self, layer_dir: &Path) -> OLMResult<()> {
|
||||
let f = File::create(layer_dir.join(LAYER_META_FILE))?;
|
||||
serde_json::to_writer_pretty(f, self)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// ── Partition-level metadata ──────────────────────────────────────────────────
|
||||
// ── PartitionMeta ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// Index-level metadata stored in `meta.json` at the root of a partition index.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct PartitionMeta {
|
||||
pub n_layers: usize,
|
||||
#[serde(default)]
|
||||
pub mode: IndexMode,
|
||||
}
|
||||
|
||||
impl PartitionMeta {
|
||||
pub fn new() -> Self {
|
||||
Self { n_layers: 0 }
|
||||
pub fn new(mode: IndexMode) -> Self {
|
||||
Self { n_layers: 0, mode }
|
||||
}
|
||||
|
||||
pub fn load(dir: &Path) -> OLMResult<Self> {
|
||||
@@ -79,5 +59,5 @@ impl PartitionMeta {
|
||||
}
|
||||
|
||||
impl Default for PartitionMeta {
|
||||
fn default() -> Self { Self::new() }
|
||||
fn default() -> Self { Self::new(IndexMode::Exact) }
|
||||
}
|
||||
|
||||
+138
-153
@@ -1,5 +1,5 @@
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
@@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
use crate::evidence::{Evidence, EvidenceWriter};
|
||||
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
|
||||
use crate::meta::{EvidenceKind, LayerMeta};
|
||||
use crate::meta::IndexMode;
|
||||
|
||||
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
||||
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
||||
@@ -19,19 +19,22 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
|
||||
|
||||
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
// ── Evidence store ────────────────────────────────────────────────────────────
|
||||
// ── LayerEvidence ─────────────────────────────────────────────────────────────
|
||||
|
||||
enum LayerEvidence {
|
||||
Exact { evidence: Evidence, unitigs: UnitigFileReader },
|
||||
Approx { fingerprint: FingerprintVec },
|
||||
Exact { evidence: Evidence, unitigs: UnitigFileReader },
|
||||
Approx { fingerprint: FingerprintVec, unitigs_path: PathBuf },
|
||||
Hybrid { evidence: Evidence, unitigs: UnitigFileReader, fingerprint: FingerprintVec },
|
||||
}
|
||||
|
||||
// ── MphfLayer ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Autonomous kmer → slot mapping for one layer.
|
||||
///
|
||||
/// Dispatches queries to exact or approximate evidence transparently based on
|
||||
/// the `layer_meta.json` written at build time.
|
||||
/// Two query methods:
|
||||
/// - [`find`](Self::find) — O(1), uses fingerprint (Approx/Hybrid) or exact evidence (Exact).
|
||||
/// - [`find_strict`](Self::find_strict) — always exact; O(1) on Exact/Hybrid layers,
|
||||
/// O(n) sequential scan on Approx layers.
|
||||
pub struct MphfLayer {
|
||||
mphf: Mphf,
|
||||
ev: LayerEvidence,
|
||||
@@ -39,21 +42,31 @@ pub struct MphfLayer {
|
||||
}
|
||||
|
||||
impl MphfLayer {
|
||||
pub fn open(dir: &Path) -> OLMResult<Self> {
|
||||
let meta = LayerMeta::load(dir)?;
|
||||
/// Open a layer using the index-level `mode` determined at `LayeredMap` open time.
|
||||
/// No per-layer metadata file is read.
|
||||
pub fn open(dir: &Path, mode: &IndexMode) -> OLMResult<Self> {
|
||||
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
let (ev, n) = match meta.evidence {
|
||||
EvidenceKind::Exact => {
|
||||
let (ev, n) = match mode {
|
||||
IndexMode::Exact => {
|
||||
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
|
||||
let n = evidence.len();
|
||||
// open() auto-detects: uses direct access since exact layers always have .idx
|
||||
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
|
||||
(LayerEvidence::Exact { evidence, unitigs }, n)
|
||||
}
|
||||
EvidenceKind::Approx { .. } => {
|
||||
IndexMode::Approx { .. } => {
|
||||
let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
|
||||
let n = fingerprint.n();
|
||||
(LayerEvidence::Approx { fingerprint }, n)
|
||||
let unitigs_path = dir.join(UNITIGS_FILE);
|
||||
(LayerEvidence::Approx { fingerprint, unitigs_path }, n)
|
||||
}
|
||||
IndexMode::Hybrid { .. } => {
|
||||
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
|
||||
let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
|
||||
let n = evidence.len();
|
||||
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
|
||||
(LayerEvidence::Hybrid { evidence, unitigs, fingerprint }, n)
|
||||
}
|
||||
};
|
||||
Ok(Self { mphf, ev, n })
|
||||
@@ -61,45 +74,60 @@ impl MphfLayer {
|
||||
|
||||
// ── Query API ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// Transparent dispatch: routes to `find_exact` or `find_approx` based on
|
||||
/// the evidence loaded at `open` time.
|
||||
/// O(1) lookup — dispatches automatically:
|
||||
/// - Exact: evidence + `verify_canonical_kmer`, zero false positives.
|
||||
/// - Approx: fingerprint check, false-positive rate ≈ 1/2^b.
|
||||
/// - Hybrid: fingerprint check (fast path), false-positive rate ≈ 1/2^b; use `find_strict` for zero false positives.
|
||||
#[inline]
|
||||
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||
let slot = self.mphf.index(&kmer.raw());
|
||||
if slot >= self.n { return None; }
|
||||
match &self.ev {
|
||||
LayerEvidence::Exact { .. } => self.find_exact(kmer),
|
||||
LayerEvidence::Approx { .. } => self.find_approx(kmer),
|
||||
LayerEvidence::Exact { evidence, unitigs } => {
|
||||
let (chunk_id, rank) = evidence.decode(slot);
|
||||
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
||||
Some(slot)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
LayerEvidence::Approx { fingerprint, .. } |
|
||||
LayerEvidence::Hybrid { fingerprint, .. } => {
|
||||
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Exact lookup: zero false positives. Panics if the layer was opened with
|
||||
/// approximate evidence.
|
||||
#[inline]
|
||||
pub fn find_exact(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||
let LayerEvidence::Exact { evidence, unitigs } = &self.ev else {
|
||||
panic!("find_exact called on an approximate layer");
|
||||
};
|
||||
/// Always-exact lookup — zero false positives regardless of mode.
|
||||
///
|
||||
/// - Exact/Hybrid: O(1) via evidence + `verify_canonical_kmer`.
|
||||
/// - Approx: O(n) sequential scan of `unitigs.bin` to confirm the kmer
|
||||
/// that owns the slot, then exact comparison.
|
||||
pub fn find_strict(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||
let slot = self.mphf.index(&kmer.raw());
|
||||
if slot >= self.n { return None; }
|
||||
let (chunk_id, rank) = evidence.decode(slot);
|
||||
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
||||
Some(slot)
|
||||
} else {
|
||||
None
|
||||
match &self.ev {
|
||||
LayerEvidence::Exact { evidence, unitigs } |
|
||||
LayerEvidence::Hybrid { evidence, unitigs, .. } => {
|
||||
let (chunk_id, rank) = evidence.decode(slot);
|
||||
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
|
||||
Some(slot)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
LayerEvidence::Approx { unitigs_path, .. } => {
|
||||
let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?;
|
||||
for stored in reader.iter_canonical_kmers() {
|
||||
if self.mphf.index(&stored.raw()) == slot {
|
||||
return if stored == kmer { Some(slot) } else { None };
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Approximate lookup: false-positive rate 1/2^b per k-mer query. Panics
|
||||
/// if the layer was opened with exact evidence.
|
||||
#[inline]
|
||||
pub fn find_approx(&self, kmer: CanonicalKmer) -> Option<usize> {
|
||||
let LayerEvidence::Approx { fingerprint } = &self.ev else {
|
||||
panic!("find_approx called on an exact layer");
|
||||
};
|
||||
let slot = self.mphf.index(&kmer.raw());
|
||||
if slot >= self.n { return None; }
|
||||
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
|
||||
// ── Build helpers ─────────────────────────────────────────────────────────
|
||||
@@ -109,19 +137,7 @@ impl MphfLayer {
|
||||
Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?)
|
||||
}
|
||||
|
||||
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
|
||||
/// `kind`. `block_bits` is forwarded to exact evidence only.
|
||||
pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
|
||||
match kind {
|
||||
EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits),
|
||||
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
|
||||
///
|
||||
/// `block_bits` controls the `.idx` block size (2^block_bits chunks per block).
|
||||
/// Uses sequential iteration — no `.idx` required on entry.
|
||||
pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult<usize> {
|
||||
let unitig_path = dir.join(UNITIGS_FILE);
|
||||
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||
@@ -130,7 +146,6 @@ impl MphfLayer {
|
||||
if n == 0 {
|
||||
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
LayerMeta::exact().save(dir)?;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
@@ -156,13 +171,10 @@ impl MphfLayer {
|
||||
|
||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
LayerMeta::exact().save(dir)?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
/// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`.
|
||||
/// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer
|
||||
/// parameter (≥1). No `.idx` is written.
|
||||
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
||||
if b == 0 || b > 64 {
|
||||
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
|
||||
@@ -176,7 +188,6 @@ impl MphfLayer {
|
||||
|
||||
if n == 0 {
|
||||
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
|
||||
LayerMeta::approx(b, z).save(dir)?;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
@@ -194,139 +205,113 @@ impl MphfLayer {
|
||||
}
|
||||
|
||||
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
||||
LayerMeta::approx(b, z).save(dir)?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
/// Build MPHF then evidence from the unitigs file already present in `dir`.
|
||||
/// Build MPHF + evidence from `unitigs.bin` already present in `dir`.
|
||||
///
|
||||
/// - Exact: `.idx` is built for pass-1 parallel construction and kept for
|
||||
/// query-time kmer verification. `evidence.bin` is written.
|
||||
/// - Approx: pass-1 uses `open_sequential` + `par_bridge` — no `.idx` is
|
||||
/// ever created. `fingerprint.bin` is written.
|
||||
///
|
||||
/// `fill_slot(slot, kmer)` is called once per kmer in both modes.
|
||||
/// `fill_slot(slot, kmer)` is called once per kmer in all modes.
|
||||
/// No `layer_meta.json` is written — the mode is an index-level property
|
||||
/// stored in `PartitionMeta`.
|
||||
pub(crate) fn build(
|
||||
dir: &Path,
|
||||
block_bits: u8,
|
||||
evidence_kind: &EvidenceKind,
|
||||
mode: &IndexMode,
|
||||
fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
|
||||
) -> OLMResult<usize> {
|
||||
use rayon::prelude::*;
|
||||
|
||||
let unitig_path = dir.join(UNITIGS_FILE);
|
||||
let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers();
|
||||
|
||||
match evidence_kind {
|
||||
// ── Exact path ────────────────────────────────────────────────────
|
||||
// .idx is built LAST, once evidence.bin is written, so it is never
|
||||
// present during construction — only at query time.
|
||||
EvidenceKind::Exact => {
|
||||
let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers();
|
||||
let keys = CanonicalKmerIter::new(&unitig_path)
|
||||
.map_err(|e| match e {
|
||||
obiskio::SKError::Io(io) => OLMError::Io(io),
|
||||
e => OLMError::InvalidLayer(e.to_string()),
|
||||
})?;
|
||||
let sk_to_olm = |e: obiskio::SKError| match e {
|
||||
obiskio::SKError::Io(io) => OLMError::Io(io),
|
||||
e => OLMError::InvalidLayer(e.to_string()),
|
||||
};
|
||||
|
||||
if n == 0 {
|
||||
// ── Empty layer ───────────────────────────────────────────────────────
|
||||
if n == 0 {
|
||||
let mphf: Mphf =
|
||||
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
||||
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
||||
mphf.store(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
match mode {
|
||||
IndexMode::Exact | IndexMode::Hybrid { .. } => {
|
||||
fs::File::create(dir.join(EVIDENCE_FILE))?;
|
||||
let mphf: Mphf =
|
||||
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
||||
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
||||
mphf.store(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
LayerMeta::exact().save(dir)?;
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
return Ok(0);
|
||||
}
|
||||
IndexMode::Approx { b, .. } => {
|
||||
FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?;
|
||||
}
|
||||
}
|
||||
if let IndexMode::Hybrid { b, .. } = mode {
|
||||
FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?;
|
||||
}
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
// Pass 1 — MPHF construction via clonable mmap iterator
|
||||
let mphf: Mphf =
|
||||
Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::<CubicEps>::default());
|
||||
mphf.store(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
// ── Pass 1: MPHF via clonable mmap iterator ───────────────────────────
|
||||
let keys = CanonicalKmerIter::new(&unitig_path).map_err(sk_to_olm)?;
|
||||
let mphf: Mphf =
|
||||
Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(),
|
||||
PtrHashParams::<CubicEps>::default());
|
||||
mphf.store(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
|
||||
// Pass 2 — sequential: fill evidence.bin + callback
|
||||
let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||
let mut ev = EvidenceWriter::new(n);
|
||||
let mut seen = vec![0u8; (n + 7) / 8];
|
||||
// ── Pass 2: fill evidence files + callback ────────────────────────────
|
||||
let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||
let mut seen = vec![0u8; (n + 7) / 8];
|
||||
|
||||
match mode {
|
||||
IndexMode::Exact => {
|
||||
let mut ev = EvidenceWriter::new(n);
|
||||
for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() {
|
||||
let slot = mphf.index(&kmer.raw());
|
||||
if slot >= n {
|
||||
return Err(OLMError::Mphf("slot out of bounds".into()));
|
||||
}
|
||||
let byte = slot / 8;
|
||||
let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 {
|
||||
return Err(OLMError::Mphf("duplicate slot".into()));
|
||||
}
|
||||
if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
|
||||
let byte = slot / 8; let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); }
|
||||
seen[byte] |= bit;
|
||||
ev.set(slot, chunk_id as u32, rank as u8);
|
||||
fill_slot(slot, kmer)?;
|
||||
}
|
||||
|
||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||
LayerMeta::exact().save(dir)?;
|
||||
// .idx built last: strictly for query-time kmer verification
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
// ── Approx path ───────────────────────────────────────────────────
|
||||
// No .idx is created at any point.
|
||||
EvidenceKind::Approx { b, z } => {
|
||||
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||
let n = unitigs.n_kmers();
|
||||
|
||||
if n == 0 {
|
||||
FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?;
|
||||
let mphf: Mphf =
|
||||
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
||||
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
||||
mphf.store(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
LayerMeta::approx(*b, *z).save(dir)?;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
// Pass 1 — MPHF construction via mmap-backed clonable iterator.
|
||||
// No .idx is created. par_bridge() parallelises the sequential scan;
|
||||
// Clone on CanonicalKmerRawIter shares the Arc<Mmap> and resets to pos 0.
|
||||
let keys = CanonicalKmerIter::new(&unitig_path)
|
||||
.map_err(|e| match e {
|
||||
obiskio::SKError::Io(io) => OLMError::Io(io),
|
||||
e => OLMError::InvalidLayer(e.to_string()),
|
||||
})?;
|
||||
let mphf: Mphf =
|
||||
Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::<CubicEps>::default());
|
||||
mphf.store(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
|
||||
// Pass 2 — sequential: fill fingerprint.bin + callback
|
||||
let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||
let mut fw = FingerprintVecWriter::new(n, *b);
|
||||
let mut seen = vec![0u8; (n + 7) / 8];
|
||||
|
||||
IndexMode::Approx { b, .. } => {
|
||||
let mut fw = FingerprintVecWriter::new(n, *b);
|
||||
for kmer in unitigs2.iter_canonical_kmers() {
|
||||
let slot = mphf.index(&kmer.raw());
|
||||
if slot >= n {
|
||||
return Err(OLMError::Mphf("slot out of bounds".into()));
|
||||
}
|
||||
let byte = slot / 8;
|
||||
let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 {
|
||||
return Err(OLMError::Mphf("duplicate slot".into()));
|
||||
}
|
||||
if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
|
||||
let byte = slot / 8; let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); }
|
||||
seen[byte] |= bit;
|
||||
fw.set(slot, kmer.seq_hash());
|
||||
fill_slot(slot, kmer)?;
|
||||
}
|
||||
|
||||
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
||||
LayerMeta::approx(*b, *z).save(dir)?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
IndexMode::Hybrid { b, .. } => {
|
||||
let mut ev = EvidenceWriter::new(n);
|
||||
let mut fw = FingerprintVecWriter::new(n, *b);
|
||||
for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() {
|
||||
let slot = mphf.index(&kmer.raw());
|
||||
if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
|
||||
let byte = slot / 8; let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); }
|
||||
seen[byte] |= bit;
|
||||
ev.set(slot, chunk_id as u32, rank as u8);
|
||||
fw.set(slot, kmer.seq_hash());
|
||||
fill_slot(slot, kmer)?;
|
||||
}
|
||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
||||
build_unitig_idx(&unitig_path, block_bits)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user