feat: add evidence metadata and configurable k-mer parameters

Introduces the `EvidenceKind` enum and the `LayerMeta` struct to manage per-layer evidence configuration and false-positive parameters. Adds JSON serialization for layer metadata persistence and updates `build_approx_evidence` to accept a `z` parameter: the number of consecutive k-mers that must all match. Exposes these types publicly and documents a future `aggregate` command for merging index matrix columns.
This commit is contained in:
Eric Coissac
2026-05-23 08:59:11 +02:00
parent e1dab86daf
commit 16a6b0d033
6 changed files with 86 additions and 6 deletions
+4 -3
View File
@@ -86,11 +86,12 @@ impl<D: LayerData> Layer<D> {
}
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
/// present in `layer_dir`. `b` is the fingerprint width in bits (1..=64).
/// present in `layer_dir`. `b` — fingerprint bits (1..=64); `z` — Findere
/// consecutive k-mer parameter (≥1).
///
/// See [`MphfLayer::build_approx_evidence`] for the full contract.
pub fn build_approx_evidence(layer_dir: &Path, b: u8) -> OLMResult<usize> {
MphfLayer::build_approx_evidence(layer_dir, b)
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
MphfLayer::build_approx_evidence(layer_dir, b, z)
}
}
+1
View File
@@ -11,4 +11,5 @@ pub use error::{OLMError, OLMResult};
pub use layer::{Hit, Layer, LayerData};
pub use layered_store::LayeredStore;
pub use map::LayeredMap;
pub use meta::{EvidenceKind, LayerMeta};
pub use mphf_layer::MphfLayer;
+49 -1
View File
@@ -5,7 +5,55 @@ use serde::{Deserialize, Serialize};
use crate::error::OLMResult;
const META_FILE: &str = "meta.json";
const META_FILE: &str = "meta.json";
const LAYER_META_FILE: &str = "layer_meta.json";
// ── Layer-level metadata ──────────────────────────────────────────────────────
/// Describes the evidence bundle stored alongside the MPHF for one layer.
///
/// Serialized as an internally tagged object: the `type` field carries the
/// snake_case variant name (`"exact"` or `"approx"`), and the fields of
/// `Approx` are stored next to it.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvidenceKind {
    /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
    Exact,
    /// Approximate evidence: `fingerprint.bin` only.
    ///
    /// * `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b.
    /// * `z` — consecutive k-mers that must all match (Findere trick);
    ///   effective FP rate per read ≈ W / 2^(b*z), where W = read windows.
    Approx { b: u8, z: u8 },
}
/// Per-layer metadata, persisted as JSON inside the layer directory.
///
/// Records which kind of evidence was built for the layer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerMeta {
/// Evidence bundle built for this layer (exact or approximate).
pub evidence: EvidenceKind,
}
impl Default for EvidenceKind {
fn default() -> Self { Self::Exact }
}
impl LayerMeta {
    /// Metadata for a layer built with exact evidence.
    pub fn exact() -> Self {
        Self { evidence: EvidenceKind::Exact }
    }

    /// Metadata for a layer built with approximate evidence.
    ///
    /// `b` — fingerprint bits; `z` — number of consecutive k-mers that must
    /// all match. The values are stored as given; no validation happens here.
    pub fn approx(b: u8, z: u8) -> Self {
        Self { evidence: EvidenceKind::Approx { b, z } }
    }

    /// Read the layer metadata file located in `layer_dir`.
    ///
    /// # Errors
    ///
    /// Propagates the failure if the file cannot be opened or if its content
    /// does not deserialize into a `LayerMeta`.
    pub fn load(layer_dir: &Path) -> OLMResult<Self> {
        let file = File::open(layer_dir.join(LAYER_META_FILE))?;
        // serde_json does not buffer its input, so reading straight from a
        // `File` would issue many tiny reads.
        let reader = std::io::BufReader::new(file);
        Ok(serde_json::from_reader(reader)?)
    }

    /// Write the layer metadata as pretty-printed JSON into `layer_dir`,
    /// creating the file or truncating an existing one.
    ///
    /// # Errors
    ///
    /// Propagates the failure if the file cannot be created, serialization
    /// fails, or the buffered bytes cannot be flushed to disk.
    pub fn save(&self, layer_dir: &Path) -> OLMResult<()> {
        let file = File::create(layer_dir.join(LAYER_META_FILE))?;
        let mut writer = std::io::BufWriter::new(file);
        serde_json::to_writer_pretty(&mut writer, self)?;
        // Flush explicitly: a flush performed on drop would silently discard
        // any write error.
        std::io::Write::flush(&mut writer)?;
        Ok(())
    }
}
// ── Partition-level metadata ──────────────────────────────────────────────────
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartitionMeta {
+11 -2
View File
@@ -10,6 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
use crate::error::{OLMError, OLMResult};
use crate::evidence::{Evidence, EvidenceWriter};
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
use crate::meta::LayerMeta;
pub(crate) const MPHF_FILE: &str = "mphf.bin";
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
@@ -118,24 +119,30 @@ impl MphfLayer {
ev.write(&dir.join(EVIDENCE_FILE))?;
// Write .idx last: it is only needed for random access (queries).
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
LayerMeta::exact().save(dir)?;
Ok(n)
}
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
/// in `dir`. `b` is the number of fingerprint bits (1..=64).
/// in `dir`. `b` — fingerprint bits (1..=64); `z` — Findere consecutive
/// k-mer parameter (≥1).
///
/// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
/// No `.idx` file is written — approximate evidence needs no random access.
pub fn build_approx_evidence(dir: &Path, b: u8) -> OLMResult<usize> {
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
if b == 0 || b > 64 {
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
}
if z == 0 {
return Err(OLMError::InvalidLayer("z must be ≥ 1".into()));
}
let unitig_path = dir.join(UNITIGS_FILE);
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
let n = unitigs.n_kmers();
if n == 0 {
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
LayerMeta::approx(b, z).save(dir)?;
return Ok(0);
}
@@ -153,6 +160,7 @@ impl MphfLayer {
}
fw.write(&dir.join(FINGERPRINT_FILE))?;
LayerMeta::approx(b, z).save(dir)?;
Ok(n)
}
@@ -214,6 +222,7 @@ impl MphfLayer {
}
ev.write(&dir.join(EVIDENCE_FILE))?;
LayerMeta::exact().save(dir)?;
Ok(n)
}
}