diff --git a/TODO.md b/TODO.md index a874150..ceca1fe 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,21 @@ +## À finir dans le cadre de l'extension des index à une forme approximative + +- Il faut avoir un chemin explicite pour construire en mode exact avec des méthodes qui ont ce mot exact à l'intérieur. + - pub fn find_exact (src/obilayeredmap/src/mphf_layer.rs) + - pub fn build_exact_evidence (src/obilayeredmap/src/layer.rs) + +Comme elles existent actuellement pour le mode approx. + +Ensuite, il faudra définir des méthodes génériques +- find() +- build_evidence() + +qui utilisent la bonne version suivant le mode de l'index de manière complètement transparente. +Avec ce système, tout le reste du code devrait être insensible au fait que l'on utilise un index exact ou approximatif. + +Sauf qu'avec un index approximatif, les résultats seront approximatifs. + + ## commandes à ajouter - aggregate : aggrege toutes les colonnes d'une matrice d'index en une seule colonne. diff --git a/src/obikindex/src/meta.rs b/src/obikindex/src/meta.rs index 6832cee..6d691f8 100644 --- a/src/obikindex/src/meta.rs +++ b/src/obikindex/src/meta.rs @@ -3,6 +3,7 @@ use std::fs; use std::io; use std::path::Path; +use obilayeredmap::EvidenceKind; use serde::{Deserialize, Serialize}; pub const META_FILENAME: &str = "index.meta"; @@ -28,6 +29,8 @@ pub struct IndexConfig { pub minimizer_size: usize, pub n_bits: usize, pub with_counts: bool, + #[serde(default)] + pub evidence: EvidenceKind, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index b67709f..04f95ec 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -86,11 +86,12 @@ impl Layer { } /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already - /// present in `layer_dir`. `b` is the fingerprint width in bits (1..=64). + /// present in `layer_dir`. 
`b` — fingerprint bits (1..=64); `z` — Findere + /// consecutive k-mer parameter (≥1). /// /// See [`MphfLayer::build_approx_evidence`] for the full contract. - pub fn build_approx_evidence(layer_dir: &Path, b: u8) -> OLMResult { - MphfLayer::build_approx_evidence(layer_dir, b) + pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult { + MphfLayer::build_approx_evidence(layer_dir, b, z) } } diff --git a/src/obilayeredmap/src/lib.rs b/src/obilayeredmap/src/lib.rs index dacbe7a..98ca3c8 100644 --- a/src/obilayeredmap/src/lib.rs +++ b/src/obilayeredmap/src/lib.rs @@ -11,4 +11,5 @@ pub use error::{OLMError, OLMResult}; pub use layer::{Hit, Layer, LayerData}; pub use layered_store::LayeredStore; pub use map::LayeredMap; +pub use meta::{EvidenceKind, LayerMeta}; pub use mphf_layer::MphfLayer; diff --git a/src/obilayeredmap/src/meta.rs b/src/obilayeredmap/src/meta.rs index 12e17f7..bc8d93e 100644 --- a/src/obilayeredmap/src/meta.rs +++ b/src/obilayeredmap/src/meta.rs @@ -5,7 +5,55 @@ use serde::{Deserialize, Serialize}; use crate::error::OLMResult; -const META_FILE: &str = "meta.json"; +const META_FILE: &str = "meta.json"; +const LAYER_META_FILE: &str = "layer_meta.json"; + +// ── Layer-level metadata ────────────────────────────────────────────────────── + +/// Describes the evidence bundle stored alongside the MPHF for one layer. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", rename_all = "snake_case")] +pub enum EvidenceKind { + /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives. + Exact, + /// Approximate evidence: `fingerprint.bin` only. + /// `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b. + /// `z` — consecutive k-mers that must all match (Findere trick); + /// effective FP rate per read ≈ (W / 2^(b*z)) where W = read windows. 
+ Approx { b: u8, z: u8 }, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LayerMeta { + pub evidence: EvidenceKind, +} + +impl Default for EvidenceKind { + fn default() -> Self { Self::Exact } +} + +impl LayerMeta { + pub fn exact() -> Self { + Self { evidence: EvidenceKind::Exact } + } + + pub fn approx(b: u8, z: u8) -> Self { + Self { evidence: EvidenceKind::Approx { b, z } } + } + + pub fn load(layer_dir: &Path) -> OLMResult { + let f = File::open(layer_dir.join(LAYER_META_FILE))?; + Ok(serde_json::from_reader(f)?) + } + + pub fn save(&self, layer_dir: &Path) -> OLMResult<()> { + let f = File::create(layer_dir.join(LAYER_META_FILE))?; + serde_json::to_writer_pretty(f, self)?; + Ok(()) + } +} + +// ── Partition-level metadata ────────────────────────────────────────────────── #[derive(Debug, Clone, Serialize, Deserialize)] pub struct PartitionMeta { diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs index a025784..e5d4cf2 100644 --- a/src/obilayeredmap/src/mphf_layer.rs +++ b/src/obilayeredmap/src/mphf_layer.rs @@ -10,6 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64}; use crate::error::{OLMError, OLMResult}; use crate::evidence::{Evidence, EvidenceWriter}; use crate::fingerprint::{FingerprintVec, FingerprintVecWriter}; +use crate::meta::LayerMeta; pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; @@ -118,24 +119,30 @@ impl MphfLayer { ev.write(&dir.join(EVIDENCE_FILE))?; // Write .idx last: it is only needed for random access (queries). build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; + LayerMeta::exact().save(dir)?; Ok(n) } /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present - /// in `dir`. `b` is the number of fingerprint bits (1..=64). + /// in `dir`. `b` — fingerprint bits (1..=64); `z` — Findere consecutive + /// k-mer parameter (≥1). 
/// /// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`. /// No `.idx` file is written — approximate evidence needs no random access. - pub fn build_approx_evidence(dir: &Path, b: u8) -> OLMResult { + pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult { if b == 0 || b > 64 { return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into())); } + if z == 0 { + return Err(OLMError::InvalidLayer("z must be ≥ 1".into())); + } let unitig_path = dir.join(UNITIGS_FILE); let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; let n = unitigs.n_kmers(); if n == 0 { FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?; + LayerMeta::approx(b, z).save(dir)?; return Ok(0); } @@ -153,6 +160,7 @@ impl MphfLayer { } fw.write(&dir.join(FINGERPRINT_FILE))?; + LayerMeta::approx(b, z).save(dir)?; Ok(n) } @@ -214,6 +222,7 @@ impl MphfLayer { } ev.write(&dir.join(EVIDENCE_FILE))?; + LayerMeta::exact().save(dir)?; Ok(n) } }