refactor: switch indexing to IndexMode and update metadata

Replace EvidenceKind with IndexMode (Exact, Approx, Hybrid) across layer construction and query dispatch. Remove the per-layer LayerMeta (layer_meta.json) and store the mode once in PartitionMeta (meta.json) to centralize index-wide configuration. Add flexible push_layer overloads to LayeredMap for dynamic index expansion without full rebuilds. Improve UnitigFileReader to gracefully fall back to sequential scanning when indexes are missing, eliminating panics.
This commit is contained in:
Eric Coissac
2026-05-26 10:04:25 +02:00
parent 1d880fdc5f
commit 7501b6e854
9 changed files with 284 additions and 315 deletions
+5 -7
View File
@@ -5,7 +5,7 @@ use cacheline_ef::{CachelineEf, CachelineEfVec};
use epserde::prelude::*; use epserde::prelude::*;
use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec}; use obicompactvec::{PersistentCompactIntMatrix, PersistentCompactIntVec};
use obidebruinj::GraphDeBruijn; use obidebruinj::GraphDeBruijn;
use obilayeredmap::{EvidenceKind, OLMError, layer::Layer}; use obilayeredmap::{IndexMode, OLMError, layer::Layer};
use obilayeredmap::meta::PartitionMeta; use obilayeredmap::meta::PartitionMeta;
use obiskio::{SKError, SKFileMeta, SKFileReader}; use obiskio::{SKError, SKFileMeta, SKFileReader};
use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64}; use ptr_hash::{PtrHash, bucket_fn::CubicEps, hash::Xx64};
@@ -44,7 +44,7 @@ impl KmerPartition {
min_ab: u32, min_ab: u32,
max_ab: Option<u32>, max_ab: Option<u32>,
with_counts: bool, with_counts: bool,
evidence: &EvidenceKind, mode: &IndexMode,
block_bits: u8, block_bits: u8,
) -> Result<usize, SKError> { ) -> Result<usize, SKError> {
let part_dir = self.part_dir(i); let part_dir = self.part_dir(i);
@@ -110,7 +110,7 @@ impl KmerPartition {
uw.close()?; uw.close()?;
if with_counts { if with_counts {
Layer::<PersistentCompactIntMatrix>::build(&layer_dir, block_bits, evidence, |kmer| { Layer::<PersistentCompactIntMatrix>::build(&layer_dir, block_bits, mode, |kmer| {
match (&mphf1_opt, &counts1_opt) { match (&mphf1_opt, &counts1_opt) {
(Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())), (Some(mphf), Some(counts)) => counts.get(mphf.index(&kmer.raw())),
_ => 1, _ => 1,
@@ -118,13 +118,11 @@ impl KmerPartition {
}) })
.map_err(olm_to_sk)?; .map_err(olm_to_sk)?;
} else { } else {
Layer::<()>::build(&layer_dir, block_bits, evidence).map_err(olm_to_sk)?; Layer::<()>::build(&layer_dir, block_bits, mode).map_err(olm_to_sk)?;
} }
// Write meta.json in the index/ directory so LayeredMap::open works
// (e.g. for subsequent merge operations).
let index_dir = layer_dir.parent().expect("layer_dir has a parent"); let index_dir = layer_dir.parent().expect("layer_dir has a parent");
PartitionMeta { n_layers: 1 }.save(index_dir).map_err(olm_to_sk)?; PartitionMeta { n_layers: 1, mode: mode.clone() }.save(index_dir).map_err(olm_to_sk)?;
Ok(n_kmers) Ok(n_kmers)
} }
+10 -11
View File
@@ -9,7 +9,7 @@ use obicompactvec::{PersistentBitMatrix, PersistentBitMatrixBuilder,
PersistentCompactIntVecBuilder}; PersistentCompactIntVecBuilder};
use obikseq::CanonicalKmer; use obikseq::CanonicalKmer;
use obiskio::{SKError, SKResult, UnitigFileReader}; use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{EvidenceKind, Layer, LayeredMap, MphfLayer, OLMError}; use obilayeredmap::{IndexMode, Layer, LayeredMap, MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta; use obilayeredmap::meta::PartitionMeta;
use crate::partition::KmerPartition; use crate::partition::KmerPartition;
@@ -52,18 +52,17 @@ pub(crate) enum SrcLayerData {
} }
impl SrcLayerData { impl SrcLayerData {
pub(crate) fn open(layer_dir: &Path, mode: MergeMode) -> SKResult<Self> { pub(crate) fn open(layer_dir: &Path, merge_mode: MergeMode, index_mode: &IndexMode) -> SKResult<Self> {
let presence_dir = layer_dir.join("presence"); let presence_dir = layer_dir.join("presence");
let counts_dir = layer_dir.join("counts"); let counts_dir = layer_dir.join("counts");
match mode { match merge_mode {
MergeMode::Presence => { MergeMode::Presence => {
if presence_dir.exists() { if presence_dir.exists() {
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?; let mat = PersistentBitMatrix::open(&presence_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Presence(mphf, mat)) Ok(SrcLayerData::Presence(mphf, mat))
} else if counts_dir.exists() { } else if counts_dir.exists() {
// Source is a count index; treat count > 0 as present via ColBuilder::Bit. let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat)) Ok(SrcLayerData::Count(mphf, mat))
} else { } else {
@@ -72,7 +71,7 @@ impl SrcLayerData {
} }
MergeMode::Count => { MergeMode::Count => {
if counts_dir.exists() { if counts_dir.exists() {
let mphf = MphfLayer::open(layer_dir).map_err(olm_to_sk)?; let mphf = MphfLayer::open(layer_dir, index_mode).map_err(olm_to_sk)?;
let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?; let mat = PersistentCompactIntMatrix::open(&counts_dir).map_err(SKError::Io)?;
Ok(SrcLayerData::Count(mphf, mat)) Ok(SrcLayerData::Count(mphf, mat))
} else { } else {
@@ -116,7 +115,7 @@ fn load_meta(dir: &Path) -> SKResult<PartitionMeta> {
Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => { Err(e) if matches!(e, OLMError::Io(ref io_e) if io_e.kind() == std::io::ErrorKind::NotFound) => {
let mut n = 0usize; let mut n = 0usize;
while dir.join(format!("layer_{n}")).exists() { n += 1; } while dir.join(format!("layer_{n}")).exists() { n += 1; }
let m = PartitionMeta { n_layers: n }; let m = PartitionMeta { n_layers: n, mode: IndexMode::default() };
m.save(dir).map_err(olm_to_sk)?; m.save(dir).map_err(olm_to_sk)?;
Ok(m) Ok(m)
} }
@@ -217,12 +216,12 @@ impl KmerPartition {
uw.write(&unitig)?; uw.write(&unitig)?;
} }
uw.close()?; uw.close()?;
Layer::<()>::build(&new_layer_dir, block_bits, &EvidenceKind::Exact).map_err(olm_to_sk)?; Layer::<()>::build(&new_layer_dir, block_bits, &IndexMode::Exact).map_err(olm_to_sk)?;
} }
drop(g); drop(g);
let new_mphf = if any_new { let new_mphf = if any_new {
Some(MphfLayer::open(&new_layer_dir).map_err(olm_to_sk)?) Some(MphfLayer::open(&new_layer_dir, &IndexMode::Exact).map_err(olm_to_sk)?)
} else { } else {
None None
}; };
@@ -304,7 +303,7 @@ impl KmerPartition {
for l in 0..src_meta.n_layers { for l in 0..src_meta.n_layers {
let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let src_layer_dir = src_index_dir.join(format!("layer_{l}"));
let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?; let reader = UnitigFileReader::open_sequential(&src_layer_dir.join("unitigs.bin"))?;
let src_data = SrcLayerData::open(&src_layer_dir, mode)?; let src_data = SrcLayerData::open(&src_layer_dir, mode, &src_meta.mode)?;
for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
let values = src_data.lookup(kmer, *src_n); let values = src_data.lookup(kmer, *src_n);
+1 -1
View File
@@ -8,7 +8,7 @@ use obicompactvec::{PersistentBitMatrixBuilder,
PersistentCompactIntVecBuilder}; PersistentCompactIntVecBuilder};
use obidebruinj::GraphDeBruijn; use obidebruinj::GraphDeBruijn;
use obiskio::{SKError, SKResult, UnitigFileReader}; use obiskio::{SKError, SKResult, UnitigFileReader};
use obilayeredmap::{EvidenceKind, Layer, MphfLayer, OLMError}; use obilayeredmap::{IndexMode, Layer, MphfLayer, OLMError};
use obilayeredmap::meta::PartitionMeta; use obilayeredmap::meta::PartitionMeta;
use crate::filter::KmerFilter; use crate::filter::KmerFilter;
+11 -16
View File
@@ -10,7 +10,7 @@ use obikseq::CanonicalKmer;
use obiskio::{UnitigFileReader, UnitigFileWriter}; use obiskio::{UnitigFileReader, UnitigFileWriter};
use crate::error::{OLMError, OLMResult}; use crate::error::{OLMError, OLMResult};
use crate::meta::EvidenceKind; use crate::meta::IndexMode;
use crate::mphf_layer::MphfLayer; use crate::mphf_layer::MphfLayer;
pub(crate) use crate::mphf_layer::UNITIGS_FILE; pub(crate) use crate::mphf_layer::UNITIGS_FILE;
@@ -62,8 +62,8 @@ pub struct Hit<T = ()> {
// ── Common read path ────────────────────────────────────────────────────────── // ── Common read path ──────────────────────────────────────────────────────────
impl<D: LayerData> Layer<D> { impl<D: LayerData> Layer<D> {
pub fn open(path: &Path) -> OLMResult<Self> { pub fn open(path: &Path, mode: &IndexMode) -> OLMResult<Self> {
let mphf = MphfLayer::open(path)?; let mphf = MphfLayer::open(path, mode)?;
let data = D::open(path)?; let data = D::open(path)?;
Ok(Self { mphf, data }) Ok(Self { mphf, data })
} }
@@ -92,18 +92,13 @@ impl<D: LayerData> Layer<D> {
MphfLayer::build_approx_evidence(layer_dir, b, z) MphfLayer::build_approx_evidence(layer_dir, b, z)
} }
/// Dispatch to `build_exact_evidence` or `build_approx_evidence`.
/// `block_bits` is forwarded to exact evidence only.
pub fn build_evidence(layer_dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
MphfLayer::build_evidence(layer_dir, kind, block_bits)
}
} }
// ── Mode 1 — set membership ─────────────────────────────────────────────────── // ── Mode 1 — set membership ───────────────────────────────────────────────────
impl Layer<()> { impl Layer<()> {
pub fn build(out_dir: &Path, block_bits: u8, evidence_kind: &EvidenceKind) -> OLMResult<usize> { pub fn build(out_dir: &Path, block_bits: u8, mode: &IndexMode) -> OLMResult<usize> {
MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |_, _| Ok(())) MphfLayer::build(out_dir, block_bits, mode, &mut |_, _| Ok(()))
} }
/// Create a presence matrix for a set-membership layer (first merge). /// Create a presence matrix for a set-membership layer (first merge).
@@ -126,7 +121,7 @@ impl Layer<PersistentCompactIntMatrix> {
pub fn build( pub fn build(
out_dir: &Path, out_dir: &Path,
block_bits: u8, block_bits: u8,
evidence_kind: &EvidenceKind, mode: &IndexMode,
count_of: impl Fn(CanonicalKmer) -> u32, count_of: impl Fn(CanonicalKmer) -> u32,
) -> OLMResult<usize> { ) -> OLMResult<usize> {
let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers(); let n = UnitigFileReader::open_sequential(&out_dir.join(UNITIGS_FILE))?.n_kmers();
@@ -134,7 +129,7 @@ impl Layer<PersistentCompactIntMatrix> {
let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir) let mut mb = PersistentCompactIntMatrixBuilder::new(n, &counts_dir)
.map_err(OLMError::Io)?; .map_err(OLMError::Io)?;
let mut col = mb.add_col().map_err(OLMError::Io)?; let mut col = mb.add_col().map_err(OLMError::Io)?;
let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| { let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| {
col.set(slot, count_of(kmer)); col.set(slot, count_of(kmer));
Ok(()) Ok(())
})?; })?;
@@ -146,10 +141,10 @@ impl Layer<PersistentCompactIntMatrix> {
pub fn build_from_map( pub fn build_from_map(
out_dir: &Path, out_dir: &Path,
block_bits: u8, block_bits: u8,
evidence_kind: &EvidenceKind, mode: &IndexMode,
counts: &HashMap<CanonicalKmer, u32>, counts: &HashMap<CanonicalKmer, u32>,
) -> OLMResult<usize> { ) -> OLMResult<usize> {
Self::build(out_dir, block_bits, evidence_kind, |kmer| counts.get(&kmer).copied().unwrap_or(0)) Self::build(out_dir, block_bits, mode, |kmer| counts.get(&kmer).copied().unwrap_or(0))
} }
} }
@@ -179,7 +174,7 @@ impl Layer<PersistentBitMatrix> {
pub fn build_presence( pub fn build_presence(
out_dir: &Path, out_dir: &Path,
block_bits: u8, block_bits: u8,
evidence_kind: &EvidenceKind, mode: &IndexMode,
n_genomes: usize, n_genomes: usize,
present_in: impl Fn(CanonicalKmer, usize) -> bool, present_in: impl Fn(CanonicalKmer, usize) -> bool,
) -> OLMResult<usize> { ) -> OLMResult<usize> {
@@ -189,7 +184,7 @@ impl Layer<PersistentBitMatrix> {
let mut cols: Vec<_> = (0..n_genomes) let mut cols: Vec<_> = (0..n_genomes)
.map(|_| mb.add_col().map_err(OLMError::Io)) .map(|_| mb.add_col().map_err(OLMError::Io))
.collect::<OLMResult<_>>()?; .collect::<OLMResult<_>>()?;
let n_built = MphfLayer::build(out_dir, block_bits, evidence_kind, &mut |slot, kmer| { let n_built = MphfLayer::build(out_dir, block_bits, mode, &mut |slot, kmer| {
for (g, col) in cols.iter_mut().enumerate() { for (g, col) in cols.iter_mut().enumerate() {
col.set(slot, present_in(kmer, g)); col.set(slot, present_in(kmer, g));
} }
+1 -1
View File
@@ -11,5 +11,5 @@ pub use error::{OLMError, OLMResult};
pub use layer::{Hit, Layer, LayerData}; pub use layer::{Hit, Layer, LayerData};
pub use layered_store::LayeredStore; pub use layered_store::LayeredStore;
pub use map::LayeredMap; pub use map::LayeredMap;
pub use meta::{EvidenceKind, LayerMeta}; pub use meta::{IndexMode, PartitionMeta};
pub use mphf_layer::MphfLayer; pub use mphf_layer::MphfLayer;
+17 -36
View File
@@ -5,11 +5,10 @@ use std::path::{Path, PathBuf};
use obicompactvec::PersistentCompactIntMatrix; use obicompactvec::PersistentCompactIntMatrix;
use obikseq::CanonicalKmer; use obikseq::CanonicalKmer;
use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS}; use obiskio::{UnitigFileWriter, DEFAULT_BLOCK_BITS};
use crate::meta::EvidenceKind;
use crate::error::OLMResult; use crate::error::OLMResult;
use crate::layer::{Hit, Layer, LayerData}; use crate::layer::{Hit, Layer, LayerData};
use crate::meta::PartitionMeta; use crate::meta::{IndexMode, PartitionMeta};
/// Layered kmer index for a single partition. /// Layered kmer index for a single partition.
/// ///
@@ -17,8 +16,8 @@ use crate::meta::PartitionMeta;
/// the first match wins. Adding a dataset appends a new layer without /// the first match wins. Adding a dataset appends a new layer without
/// rebuilding existing ones. /// rebuilding existing ones.
pub struct LayeredMap<D: LayerData = ()> { pub struct LayeredMap<D: LayerData = ()> {
root: PathBuf, root: PathBuf,
meta: PartitionMeta, meta: PartitionMeta,
layers: Vec<Layer<D>>, layers: Vec<Layer<D>>,
} }
@@ -26,39 +25,26 @@ pub struct LayeredMap<D: LayerData = ()> {
impl<D: LayerData> LayeredMap<D> { impl<D: LayerData> LayeredMap<D> {
/// Open an existing layered index at `root`. /// Open an existing layered index at `root`.
/// The mode is read once from `PartitionMeta` and applied to all layers.
pub fn open(root: &Path) -> OLMResult<Self> { pub fn open(root: &Path) -> OLMResult<Self> {
let meta = PartitionMeta::load(root)?; let meta = PartitionMeta::load(root)?;
let layers = (0..meta.n_layers) let layers = (0..meta.n_layers)
.map(|i| Layer::<D>::open(&layer_dir(root, i))) .map(|i| Layer::<D>::open(&layer_dir(root, i), &meta.mode))
.collect::<OLMResult<Vec<_>>>()?; .collect::<OLMResult<Vec<_>>>()?;
Ok(Self { Ok(Self { root: root.to_owned(), meta, layers })
root: root.to_owned(),
meta,
layers,
})
} }
/// Create a new, empty layered index at `root`. /// Create a new, empty layered index at `root` with the given mode.
pub fn create(root: &Path) -> OLMResult<Self> { pub fn create(root: &Path, mode: IndexMode) -> OLMResult<Self> {
fs::create_dir_all(root)?; fs::create_dir_all(root)?;
let meta = PartitionMeta::new(); let meta = PartitionMeta::new(mode);
meta.save(root)?; meta.save(root)?;
Ok(Self { Ok(Self { root: root.to_owned(), meta, layers: Vec::new() })
root: root.to_owned(),
meta,
layers: Vec::new(),
})
} }
/// Return the number of layers in this index. pub fn n_layers(&self) -> usize { self.layers.len() }
pub fn n_layers(&self) -> usize { pub fn layer(&self, i: usize) -> &Layer<D> { &self.layers[i] }
self.layers.len() pub fn mode(&self) -> &IndexMode { &self.meta.mode }
}
/// Return a reference to the `i`-th layer.
pub fn layer(&self, i: usize) -> &Layer<D> {
&self.layers[i]
}
/// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match. /// Query `kmer` across all layers. Returns `(layer_index, Hit)` on match.
pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit<D::Item>)> { pub fn query(&self, kmer: CanonicalKmer) -> Option<(usize, Hit<D::Item>)> {
@@ -68,17 +54,15 @@ impl<D: LayerData> LayeredMap<D> {
.find_map(|(i, layer)| layer.query(kmer).map(|hit| (i, hit))) .find_map(|(i, layer)| layer.query(kmer).map(|hit| (i, hit)))
} }
/// Return a `UnitigFileWriter` for the next layer to be built.
pub fn next_layer_writer(&self) -> OLMResult<UnitigFileWriter> { pub fn next_layer_writer(&self) -> OLMResult<UnitigFileWriter> {
let dir = layer_dir(&self.root, self.layers.len()); let dir = layer_dir(&self.root, self.layers.len());
Layer::<D>::unitig_writer(&dir) Layer::<D>::unitig_writer(&dir)
} }
/// Append a new layer to the index.
fn append_layer(&mut self) -> OLMResult<()> { fn append_layer(&mut self) -> OLMResult<()> {
let i = self.layers.len(); let i = self.layers.len();
let dir = layer_dir(&self.root, i); let dir = layer_dir(&self.root, i);
self.layers.push(Layer::<D>::open(&dir)?); self.layers.push(Layer::<D>::open(&dir, &self.meta.mode)?);
self.meta.n_layers = self.layers.len(); self.meta.n_layers = self.layers.len();
self.meta.save(&self.root)?; self.meta.save(&self.root)?;
Ok(()) Ok(())
@@ -91,7 +75,7 @@ impl LayeredMap<()> {
pub fn push_layer(&mut self) -> OLMResult<usize> { pub fn push_layer(&mut self) -> OLMResult<usize> {
let i = self.layers.len(); let i = self.layers.len();
let dir = layer_dir(&self.root, i); let dir = layer_dir(&self.root, i);
Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact)?; Layer::<()>::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode)?;
self.append_layer()?; self.append_layer()?;
Ok(i) Ok(i)
} }
@@ -103,15 +87,12 @@ impl LayeredMap<PersistentCompactIntMatrix> {
pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> { pub fn push_layer(&mut self, count_of: impl Fn(CanonicalKmer) -> u32) -> OLMResult<usize> {
let i = self.layers.len(); let i = self.layers.len();
let dir = layer_dir(&self.root, i); let dir = layer_dir(&self.root, i);
Layer::<PersistentCompactIntMatrix>::build(&dir, DEFAULT_BLOCK_BITS, &EvidenceKind::Exact, count_of)?; Layer::<PersistentCompactIntMatrix>::build(&dir, DEFAULT_BLOCK_BITS, &self.meta.mode, count_of)?;
self.append_layer()?; self.append_layer()?;
Ok(i) Ok(i)
} }
pub fn push_layer_from_map( pub fn push_layer_from_map(&mut self, counts: &HashMap<CanonicalKmer, u32>) -> OLMResult<usize> {
&mut self,
counts: &HashMap<CanonicalKmer, u32>,
) -> OLMResult<usize> {
self.push_layer(|kmer| counts.get(&kmer).copied().unwrap_or(0)) self.push_layer(|kmer| counts.get(&kmer).copied().unwrap_or(0))
} }
} }
+20 -40
View File
@@ -5,65 +5,45 @@ use serde::{Deserialize, Serialize};
use crate::error::OLMResult; use crate::error::OLMResult;
const META_FILE: &str = "meta.json"; const META_FILE: &str = "meta.json";
const LAYER_META_FILE: &str = "layer_meta.json";
// ── Layer-level metadata ────────────────────────────────────────────────────── // ── IndexMode ─────────────────────────────────────────────────────────────────
/// Describes the evidence bundle stored alongside the MPHF for one layer. /// Evidence mode for an entire partitioned index — homogeneous across all layers.
///
/// Determined once at build time; stored in `PartitionMeta` (`meta.json`).
/// All layers within an index share the same mode.
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")] #[serde(tag = "type", rename_all = "snake_case")]
pub enum EvidenceKind { pub enum IndexMode {
/// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives. /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
Exact, Exact,
/// Approximate evidence: `fingerprint.bin` only. /// Approximate evidence: `fingerprint.bin` only.
/// `b` — fingerprint bits; false-positive rate per k-mer query = 1/2^b. /// `b` — fingerprint bits per slot; false-positive rate ≈ 1/2^b per query.
/// `z` — consecutive k-mers that must all match (Findere trick); /// `z` — Findere consecutive-kmer parameter (build-time only; not used at query time).
/// effective FP rate per sequencing read ≈ W / 2^(b·z)
/// where W = L - k - z + 2 is the number of windows in a read of length L.
Approx { b: u8, z: u8 }, Approx { b: u8, z: u8 },
/// Hybrid: both `fingerprint.bin` and `evidence.bin` + `unitigs.bin.idx`.
/// `find()` uses the fingerprint (O(1), approx); `find_strict()` uses exact evidence.
Hybrid { b: u8, z: u8 },
} }
#[derive(Debug, Clone, Serialize, Deserialize)] impl Default for IndexMode {
pub struct LayerMeta {
pub evidence: EvidenceKind,
}
impl Default for EvidenceKind {
fn default() -> Self { Self::Exact } fn default() -> Self { Self::Exact }
} }
impl LayerMeta { // ── PartitionMeta ─────────────────────────────────────────────────────────────
pub fn exact() -> Self {
Self { evidence: EvidenceKind::Exact }
}
pub fn approx(b: u8, z: u8) -> Self {
Self { evidence: EvidenceKind::Approx { b, z } }
}
pub fn load(layer_dir: &Path) -> OLMResult<Self> {
let f = File::open(layer_dir.join(LAYER_META_FILE))?;
Ok(serde_json::from_reader(f)?)
}
pub fn save(&self, layer_dir: &Path) -> OLMResult<()> {
let f = File::create(layer_dir.join(LAYER_META_FILE))?;
serde_json::to_writer_pretty(f, self)?;
Ok(())
}
}
// ── Partition-level metadata ──────────────────────────────────────────────────
/// Index-level metadata stored in `meta.json` at the root of a partition index.
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartitionMeta { pub struct PartitionMeta {
pub n_layers: usize, pub n_layers: usize,
#[serde(default)]
pub mode: IndexMode,
} }
impl PartitionMeta { impl PartitionMeta {
pub fn new() -> Self { pub fn new(mode: IndexMode) -> Self {
Self { n_layers: 0 } Self { n_layers: 0, mode }
} }
pub fn load(dir: &Path) -> OLMResult<Self> { pub fn load(dir: &Path) -> OLMResult<Self> {
@@ -79,5 +59,5 @@ impl PartitionMeta {
} }
impl Default for PartitionMeta { impl Default for PartitionMeta {
fn default() -> Self { Self::new() } fn default() -> Self { Self::new(IndexMode::Exact) }
} }
+138 -153
View File
@@ -1,5 +1,5 @@
use std::fs; use std::fs;
use std::path::Path; use std::path::{Path, PathBuf};
use cacheline_ef::{CachelineEf, CachelineEfVec}; use cacheline_ef::{CachelineEf, CachelineEfVec};
use epserde::prelude::*; use epserde::prelude::*;
@@ -10,7 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
use crate::error::{OLMError, OLMResult}; use crate::error::{OLMError, OLMResult};
use crate::evidence::{Evidence, EvidenceWriter}; use crate::evidence::{Evidence, EvidenceWriter};
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter}; use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
use crate::meta::{EvidenceKind, LayerMeta}; use crate::meta::IndexMode;
pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const MPHF_FILE: &str = "mphf.bin";
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
@@ -19,19 +19,22 @@ pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>; pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
// ── Evidence store ──────────────────────────────────────────────────────────── // ── LayerEvidence ────────────────────────────────────────────────────────────
enum LayerEvidence { enum LayerEvidence {
Exact { evidence: Evidence, unitigs: UnitigFileReader }, Exact { evidence: Evidence, unitigs: UnitigFileReader },
Approx { fingerprint: FingerprintVec }, Approx { fingerprint: FingerprintVec, unitigs_path: PathBuf },
Hybrid { evidence: Evidence, unitigs: UnitigFileReader, fingerprint: FingerprintVec },
} }
// ── MphfLayer ───────────────────────────────────────────────────────────────── // ── MphfLayer ─────────────────────────────────────────────────────────────────
/// Autonomous kmer → slot mapping for one layer. /// Autonomous kmer → slot mapping for one layer.
/// ///
/// Dispatches queries to exact or approximate evidence transparently based on /// Two query methods:
/// the `layer_meta.json` written at build time. /// - [`find`](Self::find) — O(1), uses fingerprint (Approx/Hybrid) or exact evidence (Exact).
/// - [`find_strict`](Self::find_strict) — always exact; O(1) on Exact/Hybrid layers,
/// O(n) sequential scan on Approx layers.
pub struct MphfLayer { pub struct MphfLayer {
mphf: Mphf, mphf: Mphf,
ev: LayerEvidence, ev: LayerEvidence,
@@ -39,21 +42,31 @@ pub struct MphfLayer {
} }
impl MphfLayer { impl MphfLayer {
pub fn open(dir: &Path) -> OLMResult<Self> { /// Open a layer using the index-level `mode` determined at `LayeredMap` open time.
let meta = LayerMeta::load(dir)?; /// No per-layer metadata file is read.
pub fn open(dir: &Path, mode: &IndexMode) -> OLMResult<Self> {
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE)) let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?; .map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
let (ev, n) = match meta.evidence { let (ev, n) = match mode {
EvidenceKind::Exact => { IndexMode::Exact => {
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?; let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
let n = evidence.len(); let n = evidence.len();
// open() auto-detects: uses direct access since exact layers always have .idx
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?; let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
(LayerEvidence::Exact { evidence, unitigs }, n) (LayerEvidence::Exact { evidence, unitigs }, n)
} }
EvidenceKind::Approx { .. } => { IndexMode::Approx { .. } => {
let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?; let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
let n = fingerprint.n(); let n = fingerprint.n();
(LayerEvidence::Approx { fingerprint }, n) let unitigs_path = dir.join(UNITIGS_FILE);
(LayerEvidence::Approx { fingerprint, unitigs_path }, n)
}
IndexMode::Hybrid { .. } => {
let evidence = Evidence::open(&dir.join(EVIDENCE_FILE))?;
let fingerprint = FingerprintVec::open(&dir.join(FINGERPRINT_FILE))?;
let n = evidence.len();
let unitigs = UnitigFileReader::open(&dir.join(UNITIGS_FILE))?;
(LayerEvidence::Hybrid { evidence, unitigs, fingerprint }, n)
} }
}; };
Ok(Self { mphf, ev, n }) Ok(Self { mphf, ev, n })
@@ -61,45 +74,60 @@ impl MphfLayer {
// ── Query API ───────────────────────────────────────────────────────────── // ── Query API ─────────────────────────────────────────────────────────────
/// Transparent dispatch: routes to `find_exact` or `find_approx` based on /// O(1) lookup — dispatches automatically:
/// the evidence loaded at `open` time. /// - Exact: evidence + `verify_canonical_kmer`, zero false positives.
/// - Approx: fingerprint check, false-positive rate ≈ 1/2^b.
/// - Hybrid: fingerprint check (fast path, same false-positive rate as Approx); use `find_strict` for zero false positives.
#[inline] #[inline]
pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> { pub fn find(&self, kmer: CanonicalKmer) -> Option<usize> {
let slot = self.mphf.index(&kmer.raw());
if slot >= self.n { return None; }
match &self.ev { match &self.ev {
LayerEvidence::Exact { .. } => self.find_exact(kmer), LayerEvidence::Exact { evidence, unitigs } => {
LayerEvidence::Approx { .. } => self.find_approx(kmer), let (chunk_id, rank) = evidence.decode(slot);
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
Some(slot)
} else {
None
}
}
LayerEvidence::Approx { fingerprint, .. } |
LayerEvidence::Hybrid { fingerprint, .. } => {
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
}
} }
} }
/// Exact lookup: zero false positives. Panics if the layer was opened with /// Always-exact lookup — zero false positives regardless of mode.
/// approximate evidence. ///
#[inline] /// - Exact/Hybrid: O(1) via evidence + `verify_canonical_kmer`.
pub fn find_exact(&self, kmer: CanonicalKmer) -> Option<usize> { /// - Approx: O(n) sequential scan of `unitigs.bin` to confirm the kmer
let LayerEvidence::Exact { evidence, unitigs } = &self.ev else { /// that owns the slot, then exact comparison.
panic!("find_exact called on an approximate layer"); pub fn find_strict(&self, kmer: CanonicalKmer) -> Option<usize> {
};
let slot = self.mphf.index(&kmer.raw()); let slot = self.mphf.index(&kmer.raw());
if slot >= self.n { return None; } if slot >= self.n { return None; }
let (chunk_id, rank) = evidence.decode(slot); match &self.ev {
if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) { LayerEvidence::Exact { evidence, unitigs } |
Some(slot) LayerEvidence::Hybrid { evidence, unitigs, .. } => {
} else { let (chunk_id, rank) = evidence.decode(slot);
None if unitigs.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer) {
Some(slot)
} else {
None
}
}
LayerEvidence::Approx { unitigs_path, .. } => {
let reader = UnitigFileReader::open_sequential(unitigs_path).ok()?;
for stored in reader.iter_canonical_kmers() {
if self.mphf.index(&stored.raw()) == slot {
return if stored == kmer { Some(slot) } else { None };
}
}
None
}
} }
} }
/// Approximate lookup: false-positive rate 1/2^b per k-mer query. Panics
/// if the layer was opened with exact evidence.
#[inline]
pub fn find_approx(&self, kmer: CanonicalKmer) -> Option<usize> {
let LayerEvidence::Approx { fingerprint } = &self.ev else {
panic!("find_approx called on an exact layer");
};
let slot = self.mphf.index(&kmer.raw());
if slot >= self.n { return None; }
if fingerprint.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
}
pub fn n(&self) -> usize { self.n } pub fn n(&self) -> usize { self.n }
// ── Build helpers ───────────────────────────────────────────────────────── // ── Build helpers ─────────────────────────────────────────────────────────
@@ -109,19 +137,7 @@ impl MphfLayer {
Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?) Ok(UnitigFileWriter::create(&dir.join(UNITIGS_FILE))?)
} }
/// Dispatch to `build_exact_evidence` or `build_approx_evidence` based on
/// `kind`. `block_bits` is forwarded to exact evidence only.
pub fn build_evidence(dir: &Path, kind: &EvidenceKind, block_bits: u8) -> OLMResult<usize> {
match kind {
EvidenceKind::Exact => Self::build_exact_evidence(dir, block_bits),
EvidenceKind::Approx { b, z } => Self::build_approx_evidence(dir, *b, *z),
}
}
/// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`. /// Build `evidence.bin` + `unitigs.bin.idx` from `unitigs.bin` + `mphf.bin`.
///
/// `block_bits` controls the `.idx` block size (2^block_bits chunks per block).
/// Uses sequential iteration — no `.idx` required on entry.
pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult<usize> { pub fn build_exact_evidence(dir: &Path, block_bits: u8) -> OLMResult<usize> {
let unitig_path = dir.join(UNITIGS_FILE); let unitig_path = dir.join(UNITIGS_FILE);
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
@@ -130,7 +146,6 @@ impl MphfLayer {
if n == 0 { if n == 0 {
fs::File::create(dir.join(EVIDENCE_FILE))?; fs::File::create(dir.join(EVIDENCE_FILE))?;
build_unitig_idx(&unitig_path, block_bits)?; build_unitig_idx(&unitig_path, block_bits)?;
LayerMeta::exact().save(dir)?;
return Ok(0); return Ok(0);
} }
@@ -156,13 +171,10 @@ impl MphfLayer {
ev.write(&dir.join(EVIDENCE_FILE))?; ev.write(&dir.join(EVIDENCE_FILE))?;
build_unitig_idx(&unitig_path, block_bits)?; build_unitig_idx(&unitig_path, block_bits)?;
LayerMeta::exact().save(dir)?;
Ok(n) Ok(n)
} }
/// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`. /// Build `fingerprint.bin` from `unitigs.bin` + `mphf.bin`.
/// `b` — fingerprint bits (1..=64); `z` — Findere consecutive k-mer
/// parameter (≥1). No `.idx` is written.
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> { pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
if b == 0 || b > 64 { if b == 0 || b > 64 {
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into())); return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
@@ -176,7 +188,6 @@ impl MphfLayer {
if n == 0 { if n == 0 {
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?; FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
LayerMeta::approx(b, z).save(dir)?;
return Ok(0); return Ok(0);
} }
@@ -194,139 +205,113 @@ impl MphfLayer {
} }
fw.write(&dir.join(FINGERPRINT_FILE))?; fw.write(&dir.join(FINGERPRINT_FILE))?;
LayerMeta::approx(b, z).save(dir)?;
Ok(n) Ok(n)
} }
/// Build MPHF then evidence from the unitigs file already present in `dir`. /// Build MPHF + evidence from `unitigs.bin` already present in `dir`.
/// ///
/// - Exact: `.idx` is built for pass-1 parallel construction and kept for /// `fill_slot(slot, kmer)` is called once per kmer in all modes.
/// query-time kmer verification. `evidence.bin` is written. /// No `layer_meta.json` is written — the mode is an index-level property
/// - Approx: pass-1 uses `open_sequential` + `par_bridge` — no `.idx` is /// stored in `PartitionMeta`.
/// ever created. `fingerprint.bin` is written.
///
/// `fill_slot(slot, kmer)` is called once per kmer in both modes.
pub(crate) fn build( pub(crate) fn build(
dir: &Path, dir: &Path,
block_bits: u8, block_bits: u8,
evidence_kind: &EvidenceKind, mode: &IndexMode,
fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>, fill_slot: &mut impl FnMut(usize, CanonicalKmer) -> OLMResult<()>,
) -> OLMResult<usize> { ) -> OLMResult<usize> {
use rayon::prelude::*; use rayon::prelude::*;
let unitig_path = dir.join(UNITIGS_FILE); let unitig_path = dir.join(UNITIGS_FILE);
let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers();
match evidence_kind { let sk_to_olm = |e: obiskio::SKError| match e {
// ── Exact path ──────────────────────────────────────────────────── obiskio::SKError::Io(io) => OLMError::Io(io),
// .idx is built LAST, once evidence.bin is written, so it is never e => OLMError::InvalidLayer(e.to_string()),
// present during construction — only at query time. };
EvidenceKind::Exact => {
let n = UnitigFileReader::open_sequential(&unitig_path)?.n_kmers();
let keys = CanonicalKmerIter::new(&unitig_path)
.map_err(|e| match e {
obiskio::SKError::Io(io) => OLMError::Io(io),
e => OLMError::InvalidLayer(e.to_string()),
})?;
if n == 0 { // ── Empty layer ───────────────────────────────────────────────────────
if n == 0 {
let mphf: Mphf =
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
mphf.store(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
match mode {
IndexMode::Exact | IndexMode::Hybrid { .. } => {
fs::File::create(dir.join(EVIDENCE_FILE))?; fs::File::create(dir.join(EVIDENCE_FILE))?;
let mphf: Mphf =
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
mphf.store(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
LayerMeta::exact().save(dir)?;
build_unitig_idx(&unitig_path, block_bits)?; build_unitig_idx(&unitig_path, block_bits)?;
return Ok(0);
} }
IndexMode::Approx { b, .. } => {
FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?;
}
}
if let IndexMode::Hybrid { b, .. } = mode {
FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?;
}
return Ok(0);
}
// Pass 1 MPHF construction via clonable mmap iterator // ── Pass 1: MPHF via clonable mmap iterator ───────────────────────────
let mphf: Mphf = let keys = CanonicalKmerIter::new(&unitig_path).map_err(sk_to_olm)?;
Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::<CubicEps>::default()); let mphf: Mphf =
mphf.store(&dir.join(MPHF_FILE)) Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(),
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?; PtrHashParams::<CubicEps>::default());
mphf.store(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
// Pass 2 — sequential: fill evidence.bin + callback // ── Pass 2: fill evidence files + callback ────────────────────────────
let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?; let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?;
let mut ev = EvidenceWriter::new(n); let mut seen = vec![0u8; (n + 7) / 8];
let mut seen = vec![0u8; (n + 7) / 8];
match mode {
IndexMode::Exact => {
let mut ev = EvidenceWriter::new(n);
for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() { for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() {
let slot = mphf.index(&kmer.raw()); let slot = mphf.index(&kmer.raw());
if slot >= n { if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
return Err(OLMError::Mphf("slot out of bounds".into())); let byte = slot / 8; let bit = 1u8 << (slot % 8);
} if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); }
let byte = slot / 8;
let bit = 1u8 << (slot % 8);
if seen[byte] & bit != 0 {
return Err(OLMError::Mphf("duplicate slot".into()));
}
seen[byte] |= bit; seen[byte] |= bit;
ev.set(slot, chunk_id as u32, rank as u8); ev.set(slot, chunk_id as u32, rank as u8);
fill_slot(slot, kmer)?; fill_slot(slot, kmer)?;
} }
ev.write(&dir.join(EVIDENCE_FILE))?; ev.write(&dir.join(EVIDENCE_FILE))?;
LayerMeta::exact().save(dir)?;
// .idx built last: strictly for query-time kmer verification
build_unitig_idx(&unitig_path, block_bits)?; build_unitig_idx(&unitig_path, block_bits)?;
Ok(n)
} }
// ── Approx path ─────────────────────────────────────────────────── IndexMode::Approx { b, .. } => {
// No .idx is created at any point. let mut fw = FingerprintVecWriter::new(n, *b);
EvidenceKind::Approx { b, z } => {
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
let n = unitigs.n_kmers();
if n == 0 {
FingerprintVecWriter::new(0, *b).write(&dir.join(FINGERPRINT_FILE))?;
let mphf: Mphf =
Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
mphf.store(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
LayerMeta::approx(*b, *z).save(dir)?;
return Ok(0);
}
// Pass 1 — MPHF construction via mmap-backed clonable iterator.
// No .idx is created. par_bridge() parallelises the sequential scan;
// Clone on CanonicalKmerRawIter shares the Arc<Mmap> and resets to pos 0.
let keys = CanonicalKmerIter::new(&unitig_path)
.map_err(|e| match e {
obiskio::SKError::Io(io) => OLMError::Io(io),
e => OLMError::InvalidLayer(e.to_string()),
})?;
let mphf: Mphf =
Mphf::new_from_par_iter(n, keys.map(|k| k.raw()).par_bridge(), PtrHashParams::<CubicEps>::default());
mphf.store(&dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
// Pass 2 — sequential: fill fingerprint.bin + callback
let unitigs2 = UnitigFileReader::open_sequential(&unitig_path)?;
let mut fw = FingerprintVecWriter::new(n, *b);
let mut seen = vec![0u8; (n + 7) / 8];
for kmer in unitigs2.iter_canonical_kmers() { for kmer in unitigs2.iter_canonical_kmers() {
let slot = mphf.index(&kmer.raw()); let slot = mphf.index(&kmer.raw());
if slot >= n { if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
return Err(OLMError::Mphf("slot out of bounds".into())); let byte = slot / 8; let bit = 1u8 << (slot % 8);
} if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); }
let byte = slot / 8;
let bit = 1u8 << (slot % 8);
if seen[byte] & bit != 0 {
return Err(OLMError::Mphf("duplicate slot".into()));
}
seen[byte] |= bit; seen[byte] |= bit;
fw.set(slot, kmer.seq_hash()); fw.set(slot, kmer.seq_hash());
fill_slot(slot, kmer)?; fill_slot(slot, kmer)?;
} }
fw.write(&dir.join(FINGERPRINT_FILE))?; fw.write(&dir.join(FINGERPRINT_FILE))?;
LayerMeta::approx(*b, *z).save(dir)?; }
Ok(n)
IndexMode::Hybrid { b, .. } => {
let mut ev = EvidenceWriter::new(n);
let mut fw = FingerprintVecWriter::new(n, *b);
for (kmer, chunk_id, rank) in unitigs2.iter_indexed_canonical_kmers() {
let slot = mphf.index(&kmer.raw());
if slot >= n { return Err(OLMError::Mphf("slot out of bounds".into())); }
let byte = slot / 8; let bit = 1u8 << (slot % 8);
if seen[byte] & bit != 0 { return Err(OLMError::Mphf("duplicate slot".into())); }
seen[byte] |= bit;
ev.set(slot, chunk_id as u32, rank as u8);
fw.set(slot, kmer.seq_hash());
fill_slot(slot, kmer)?;
}
ev.write(&dir.join(EVIDENCE_FILE))?;
fw.write(&dir.join(FINGERPRINT_FILE))?;
build_unitig_idx(&unitig_path, block_bits)?;
} }
} }
Ok(n)
} }
} }
+81 -50
View File
@@ -198,11 +198,16 @@ pub fn build_unitig_idx(unitigs_path: &Path, block_bits: u8) -> SKResult<()> {
// ── Reader ──────────────────────────────────────────────────────────────────── // ── Reader ────────────────────────────────────────────────────────────────────
/// Read-only random-access view of a unitig file. /// Memory-mapped view of a unitig file, with optional direct-access index.
/// ///
/// The sequence file is memory-mapped; the block offset table is loaded into RAM /// Three constructors select the operating mode:
/// on open. Random access to chunk `i`: O(1 << block_bits) sequential mmap /// - [`open`](Self::open) — smart default: direct access if `.idx` exists, sequential otherwise.
/// reads. Sequential iteration: O(n) via a running-offset cursor. /// - [`open_sequential`](Self::open_sequential) — always sequential, ignores `.idx`.
/// - [`open_direct_access`](Self::open_direct_access) — requires `.idx`, errors if absent.
///
/// All positional methods (`chunk_start`, `verify_canonical_kmer`, …) work in
/// both modes. Without `.idx` they fall back to an O(i) sequential scan —
/// correct but slower.
pub struct UnitigFileReader { pub struct UnitigFileReader {
mmap: Mmap, mmap: Mmap,
block_offsets: Vec<u32>, block_offsets: Vec<u32>,
@@ -214,8 +219,52 @@ pub struct UnitigFileReader {
} }
impl UnitigFileReader { impl UnitigFileReader {
/// Open with `.idx` — enables both sequential iteration and random access. /// Smart default: opens with direct access if `.idx` is present, sequential otherwise.
pub fn open(path: &Path) -> SKResult<Self> { pub fn open(path: &Path) -> SKResult<Self> {
if idx_path(path).exists() {
Self::open_direct_access(path)
} else {
Self::open_sequential(path)
}
}
/// Always sequential — never reads `.idx` even if present.
///
/// Scans the binary file once to count chunks and k-mers.
/// Positional access (`chunk_start`, `verify_canonical_kmer`) falls back to
/// O(i) sequential scan.
pub fn open_sequential(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let k = obikseq::params::k();
let mut offset = 0usize;
let mut n_unitigs = 0usize;
let mut n_kmers = 0usize;
while offset < mmap.len() {
let seql_minus_k = mmap[offset] as usize;
n_kmers += seql_minus_k + 1;
offset += 1 + (seql_minus_k + k + 3) / 4;
n_unitigs += 1;
}
Ok(Self {
mmap,
block_offsets: Vec::new(),
n_unitigs,
n_kmers,
k,
block_bits: DEFAULT_BLOCK_BITS,
mask: (1usize << DEFAULT_BLOCK_BITS) - 1,
})
}
/// Requires `.idx` — errors if the companion index file is absent.
///
/// Enables O(1 << block_bits) positional access to any chunk.
/// Use only when direct access is architecturally required (query-time
/// verification on an exact-evidence layer).
pub fn open_direct_access(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?; let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? }; let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?; let (n_unitigs, n_kmers, block_bits, block_offsets) = read_idx(&idx_path(path))?;
@@ -231,58 +280,38 @@ impl UnitigFileReader {
}) })
} }
/// Open without `.idx` — sequential iteration only, no random access.
///
/// Scans the binary file once to count chunks and k-mers. Use when only
/// [`Self::iter_kmers`], [`Self::iter_canonical_kmers`], or
/// [`Self::iter_indexed_canonical_kmers`] are needed.
pub fn open_sequential(path: &Path) -> SKResult<Self> {
let file = File::open(path).map_err(SKError::Io)?;
let mmap = unsafe { Mmap::map(&file).map_err(SKError::Io)? };
let k = obikseq::params::k();
let mut offset = 0usize;
let mut n_unitigs = 0usize;
let mut n_kmers = 0usize;
while offset < mmap.len() {
let seql_minus_k = mmap[offset] as usize;
n_kmers += seql_minus_k + 1;
offset += 1 + (seql_minus_k + k + 3) / 4;
n_unitigs += 1;
}
Ok(Self {
mmap,
block_offsets: Vec::new(), // empty → random access disabled
n_unitigs,
n_kmers,
k,
block_bits: DEFAULT_BLOCK_BITS,
mask: (1usize << DEFAULT_BLOCK_BITS) - 1,
})
}
/// Number of unitig records (chunks) in the file.
pub fn len(&self) -> usize {
    self.n_unitigs
}

/// `true` when the file contains no unitig record.
pub fn is_empty(&self) -> bool {
    self.n_unitigs == 0
}

/// Total number of k-mers across all unitig records.
pub fn n_kmers(&self) -> usize {
    self.n_kmers
}

/// Block-size exponent of the offset table (`DEFAULT_BLOCK_BITS` when the
/// reader was opened sequentially).
pub fn block_bits(&self) -> u8 {
    self.block_bits
}

/// `true` when a non-empty block offset table is held, i.e. positional
/// access can use the indexed fast path instead of a full sequential scan.
pub fn has_direct_access(&self) -> bool {
    !self.block_offsets.is_empty()
}
/// Byte offset of the START of record `i` (the seql byte) in the mmap. /// Byte offset of record `i` in the mmap.
///
/// Fast path (O(1 << block_bits)) when `.idx` is loaded; degraded O(i)
/// sequential scan otherwise.
#[inline] #[inline]
fn chunk_start(&self, i: usize) -> usize { fn chunk_start(&self, i: usize) -> usize {
assert!(!self.block_offsets.is_empty(), if !self.block_offsets.is_empty() {
"random access requires UnitigFileReader::open(); use open_sequential() for iteration only"); if self.block_bits == 0 {
if self.block_bits == 0 { return self.block_offsets[i] as usize;
return self.block_offsets[i] as usize; }
let block = i >> self.block_bits;
let rem = i & self.mask;
let mut offset = self.block_offsets[block] as usize;
for _ in 0..rem {
let seql_minus_k = self.mmap[offset] as usize;
offset += 1 + (seql_minus_k + self.k + 3) / 4;
}
offset
} else {
let mut offset = 0usize;
for _ in 0..i {
let seql_minus_k = self.mmap[offset] as usize;
offset += 1 + (seql_minus_k + self.k + 3) / 4;
}
offset
} }
let block = i >> self.block_bits;
let rem = i & self.mask;
let mut offset = self.block_offsets[block] as usize;
for _ in 0..rem {
let seql_minus_k = self.mmap[offset] as usize;
offset += 1 + (seql_minus_k + self.k + 3) / 4;
}
offset
} }
/// Nucleotide length of chunk `i`. /// Nucleotide length of chunk `i`.
@@ -307,7 +336,9 @@ impl UnitigFileReader {
extract_kmer_raw(&self.mmap[offset + 1..], j, self.k) extract_kmer_raw(&self.mmap[offset + 1..], j, self.k)
} }
/// `true` iff the k-mer at position `j` of chunk `i` equals `query` (canonical). /// `true` iff the k-mer at position `j` of chunk `i` matches `query`.
///
/// Works in both modes; O(i) scan when `.idx` is absent.
#[inline] #[inline]
pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool { pub fn verify_canonical_kmer(&self, i: usize, j: usize, query: CanonicalKmer) -> bool {
canonical_raw(self.raw_kmer(i, j), self.k) == query.raw() canonical_raw(self.raw_kmer(i, j), self.k) == query.raw()