Push kztouvrzoqym #8

Merged
coissac merged 10 commits from push-kztouvrzoqym into main 2026-05-23 12:04:51 +00:00
6 changed files with 86 additions and 6 deletions
Showing only changes of commit 16a6b0d033 - Show all commits
+18
View File
@@ -1,3 +1,21 @@
## A finir dans le cadre de l'extension des index à une forme approximative
- Il faut un chemin explicite pour construire l'index en mode exact, avec des méthodes dont le nom contient le mot « exact » :
- pub fn find_exact (src/obilayeredmap/src/mphf_layer.rs)
- pub fn build_exact_evidence (src/obilayeredmap/src/layer.rs)
Comme elles existent actuellement pour le mode approx.
Ensuite, il faudra définir des méthodes génériques
- find()
- build_evidence()
qui utilisent la bonne version suivant le mode de l'index, de manière complètement transparente.
Avec ce système, tout le reste du code devrait être insensible au fait que l'on utilise un index exact ou approximatif.
Sauf qu'avec un index approximatif, les résultats seront approximatifs.
## commandes à ajouter ## commandes à ajouter
- aggregate : agrège toutes les colonnes d'une matrice d'index en une seule colonne. - aggregate : agrège toutes les colonnes d'une matrice d'index en une seule colonne.
+3
View File
@@ -3,6 +3,7 @@ use std::fs;
use std::io; use std::io;
use std::path::Path; use std::path::Path;
use obilayeredmap::EvidenceKind;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub const META_FILENAME: &str = "index.meta"; pub const META_FILENAME: &str = "index.meta";
@@ -28,6 +29,8 @@ pub struct IndexConfig {
pub minimizer_size: usize, pub minimizer_size: usize,
pub n_bits: usize, pub n_bits: usize,
pub with_counts: bool, pub with_counts: bool,
#[serde(default)]
pub evidence: EvidenceKind,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
+4 -3
View File
@@ -86,11 +86,12 @@ impl<D: LayerData> Layer<D> {
} }
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
/// present in `layer_dir`. `b` is the fingerprint width in bits (1..=64). /// present in `layer_dir`. `b` fingerprint bits (1..=64); `z` — Findere
/// consecutive k-mer parameter (≥1).
/// ///
/// See [`MphfLayer::build_approx_evidence`] for the full contract. /// See [`MphfLayer::build_approx_evidence`] for the full contract.
pub fn build_approx_evidence(layer_dir: &Path, b: u8) -> OLMResult<usize> { pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
MphfLayer::build_approx_evidence(layer_dir, b) MphfLayer::build_approx_evidence(layer_dir, b, z)
} }
} }
+1
View File
@@ -11,4 +11,5 @@ pub use error::{OLMError, OLMResult};
pub use layer::{Hit, Layer, LayerData}; pub use layer::{Hit, Layer, LayerData};
pub use layered_store::LayeredStore; pub use layered_store::LayeredStore;
pub use map::LayeredMap; pub use map::LayeredMap;
pub use meta::{EvidenceKind, LayerMeta};
pub use mphf_layer::MphfLayer; pub use mphf_layer::MphfLayer;
+48
View File
@@ -6,6 +6,54 @@ use serde::{Deserialize, Serialize};
use crate::error::OLMResult; use crate::error::OLMResult;
const META_FILE: &str = "meta.json"; const META_FILE: &str = "meta.json";
const LAYER_META_FILE: &str = "layer_meta.json";
// ── Layer-level metadata ──────────────────────────────────────────────────────
/// Describes the evidence bundle stored alongside the MPHF for one layer.
///
/// Serialized by serde as an internally tagged value: the variant name is
/// written in snake_case under the `"type"` key, and the fields of `Approx`
/// sit next to it (e.g. `{"type": "approx", "b": 8, "z": 3}`).
///
/// `PartialEq`/`Eq` are derived so that an evidence kind read from one
/// metadata file can be compared directly with another.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum EvidenceKind {
    /// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
    Exact,
    /// Approximate evidence: `fingerprint.bin` only.
    ///
    /// `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b.
    /// `z` — consecutive k-mers that must all match (Findere trick);
    /// effective FP rate per read ≈ (W / 2^(b*z)) where W = read windows.
    Approx { b: u8, z: u8 },
}
/// Per-layer metadata record; it currently carries only the kind of
/// evidence the layer was built with.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerMeta {
    /// Which evidence bundle (exact or approximate) accompanies this layer.
    pub evidence: EvidenceKind,
}
impl Default for EvidenceKind {
fn default() -> Self { Self::Exact }
}
impl LayerMeta {
    /// Metadata for a layer built with exact evidence.
    pub fn exact() -> Self {
        Self { evidence: EvidenceKind::Exact }
    }

    /// Metadata for a layer built with approximate evidence, recording the
    /// fingerprint width `b` and the consecutive k-mer parameter `z`.
    pub fn approx(b: u8, z: u8) -> Self {
        Self { evidence: EvidenceKind::Approx { b, z } }
    }

    /// Read the layer metadata file (`LAYER_META_FILE`) found in `layer_dir`.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or if its content is
    /// not valid JSON for a `LayerMeta`.
    pub fn load(layer_dir: &Path) -> OLMResult<Self> {
        let file = File::open(layer_dir.join(LAYER_META_FILE))?;
        // `serde_json::from_reader` pulls data in small reads; buffering
        // avoids one syscall per read on a raw `File`.
        let reader = std::io::BufReader::new(file);
        Ok(serde_json::from_reader(reader)?)
    }

    /// Write this metadata as pretty-printed JSON to the layer metadata
    /// file (`LAYER_META_FILE`) in `layer_dir`, replacing any existing file.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be created, if serialization
    /// fails, or if the buffered data cannot be flushed to disk.
    pub fn save(&self, layer_dir: &Path) -> OLMResult<()> {
        let file = File::create(layer_dir.join(LAYER_META_FILE))?;
        let mut writer = std::io::BufWriter::new(file);
        serde_json::to_writer_pretty(&mut writer, self)?;
        // Flush explicitly: a `BufWriter` dropped with pending data
        // swallows any write error.
        std::io::Write::flush(&mut writer)?;
        Ok(())
    }
}
// ── Partition-level metadata ──────────────────────────────────────────────────
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PartitionMeta { pub struct PartitionMeta {
+11 -2
View File
@@ -10,6 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
use crate::error::{OLMError, OLMResult}; use crate::error::{OLMError, OLMResult};
use crate::evidence::{Evidence, EvidenceWriter}; use crate::evidence::{Evidence, EvidenceWriter};
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter}; use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
use crate::meta::LayerMeta;
pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const MPHF_FILE: &str = "mphf.bin";
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
@@ -118,24 +119,30 @@ impl MphfLayer {
ev.write(&dir.join(EVIDENCE_FILE))?; ev.write(&dir.join(EVIDENCE_FILE))?;
// Write .idx last: it is only needed for random access (queries). // Write .idx last: it is only needed for random access (queries).
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?; build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
LayerMeta::exact().save(dir)?;
Ok(n) Ok(n)
} }
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
/// in `dir`. `b` is the number of fingerprint bits (1..=64). /// in `dir`. `b` fingerprint bits (1..=64); `z` — Findere consecutive
/// k-mer parameter (≥1).
/// ///
/// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`. /// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
/// No `.idx` file is written — approximate evidence needs no random access. /// No `.idx` file is written — approximate evidence needs no random access.
pub fn build_approx_evidence(dir: &Path, b: u8) -> OLMResult<usize> { pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
if b == 0 || b > 64 { if b == 0 || b > 64 {
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into())); return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
} }
if z == 0 {
return Err(OLMError::InvalidLayer("z must be ≥ 1".into()));
}
let unitig_path = dir.join(UNITIGS_FILE); let unitig_path = dir.join(UNITIGS_FILE);
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?; let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
let n = unitigs.n_kmers(); let n = unitigs.n_kmers();
if n == 0 { if n == 0 {
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?; FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
LayerMeta::approx(b, z).save(dir)?;
return Ok(0); return Ok(0);
} }
@@ -153,6 +160,7 @@ impl MphfLayer {
} }
fw.write(&dir.join(FINGERPRINT_FILE))?; fw.write(&dir.join(FINGERPRINT_FILE))?;
LayerMeta::approx(b, z).save(dir)?;
Ok(n) Ok(n)
} }
@@ -214,6 +222,7 @@ impl MphfLayer {
} }
ev.write(&dir.join(EVIDENCE_FILE))?; ev.write(&dir.join(EVIDENCE_FILE))?;
LayerMeta::exact().save(dir)?;
Ok(n) Ok(n)
} }
} }