Push kztouvrzoqym #8
@@ -1,3 +1,21 @@
|
|||||||
|
## À finir dans le cadre de l'extension des index à une forme approximative
|
||||||
|
|
||||||
|
- Il faut avoir un chemin explicite pour construire en mode exact, avec des méthodes dont le nom contient le mot « exact ».
|
||||||
|
- pub fn find_exact (src/obilayeredmap/src/mphf_layer.rs)
|
||||||
|
- pub fn build_exact_evidence (src/obilayeredmap/src/layer.rs)
|
||||||
|
|
||||||
|
Comme elles existent actuellement pour le mode approx.
|
||||||
|
|
||||||
|
Ensuite, il faudra définir des méthodes génériques
|
||||||
|
- find()
|
||||||
|
- build_evidence()
|
||||||
|
|
||||||
|
qui utilisent la bonne version suivant le mode de l'index de manière complètement transparente.
|
||||||
|
Avec ce système, tout le reste du code devrait être insensible au fait que l'on utilise un index exact ou approximatif.
|
||||||
|
|
||||||
|
Sauf qu'avec un index approximatif, les résultats seront approximatifs.
|
||||||
|
|
||||||
|
|
||||||
## commandes à ajouter
|
## commandes à ajouter
|
||||||
|
|
||||||
- aggregate : agrège toutes les colonnes d'une matrice d'index en une seule colonne.
|
- aggregate : agrège toutes les colonnes d'une matrice d'index en une seule colonne.
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use std::fs;
|
|||||||
use std::io;
|
use std::io;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
|
use obilayeredmap::EvidenceKind;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
pub const META_FILENAME: &str = "index.meta";
|
pub const META_FILENAME: &str = "index.meta";
|
||||||
@@ -28,6 +29,8 @@ pub struct IndexConfig {
|
|||||||
pub minimizer_size: usize,
|
pub minimizer_size: usize,
|
||||||
pub n_bits: usize,
|
pub n_bits: usize,
|
||||||
pub with_counts: bool,
|
pub with_counts: bool,
|
||||||
|
#[serde(default)]
|
||||||
|
pub evidence: EvidenceKind,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
|||||||
@@ -86,11 +86,12 @@ impl<D: LayerData> Layer<D> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
|
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
|
||||||
/// present in `layer_dir`. `b` is the fingerprint width in bits (1..=64).
|
/// present in `layer_dir`. `b` — fingerprint bits (1..=64); `z` — Findere
|
||||||
|
/// consecutive k-mer parameter (≥1).
|
||||||
///
|
///
|
||||||
/// See [`MphfLayer::build_approx_evidence`] for the full contract.
|
/// See [`MphfLayer::build_approx_evidence`] for the full contract.
|
||||||
pub fn build_approx_evidence(layer_dir: &Path, b: u8) -> OLMResult<usize> {
|
pub fn build_approx_evidence(layer_dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
||||||
MphfLayer::build_approx_evidence(layer_dir, b)
|
MphfLayer::build_approx_evidence(layer_dir, b, z)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -11,4 +11,5 @@ pub use error::{OLMError, OLMResult};
|
|||||||
pub use layer::{Hit, Layer, LayerData};
|
pub use layer::{Hit, Layer, LayerData};
|
||||||
pub use layered_store::LayeredStore;
|
pub use layered_store::LayeredStore;
|
||||||
pub use map::LayeredMap;
|
pub use map::LayeredMap;
|
||||||
|
pub use meta::{EvidenceKind, LayerMeta};
|
||||||
pub use mphf_layer::MphfLayer;
|
pub use mphf_layer::MphfLayer;
|
||||||
|
|||||||
@@ -5,7 +5,55 @@ use serde::{Deserialize, Serialize};
|
|||||||
|
|
||||||
use crate::error::OLMResult;
|
use crate::error::OLMResult;
|
||||||
|
|
||||||
const META_FILE: &str = "meta.json";
|
const META_FILE: &str = "meta.json";
|
||||||
|
const LAYER_META_FILE: &str = "layer_meta.json";
|
||||||
|
|
||||||
|
// ── Layer-level metadata ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Describes the evidence bundle stored alongside the MPHF for one layer.
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(tag = "type", rename_all = "snake_case")]
|
||||||
|
pub enum EvidenceKind {
|
||||||
|
/// Exact evidence: `evidence.bin` + `unitigs.bin.idx`. Zero false positives.
|
||||||
|
Exact,
|
||||||
|
/// Approximate evidence: `fingerprint.bin` only.
|
||||||
|
/// `b` — fingerprint bits; false-positive rate per k-mer = 1/2^b.
|
||||||
|
/// `z` — consecutive k-mers that must all match (Findere trick);
|
||||||
|
/// effective FP rate per read ≈ (W / 2^(b*z)) where W = read windows.
|
||||||
|
Approx { b: u8, z: u8 },
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
pub struct LayerMeta {
|
||||||
|
pub evidence: EvidenceKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for EvidenceKind {
|
||||||
|
fn default() -> Self { Self::Exact }
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LayerMeta {
|
||||||
|
pub fn exact() -> Self {
|
||||||
|
Self { evidence: EvidenceKind::Exact }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn approx(b: u8, z: u8) -> Self {
|
||||||
|
Self { evidence: EvidenceKind::Approx { b, z } }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn load(layer_dir: &Path) -> OLMResult<Self> {
|
||||||
|
let f = File::open(layer_dir.join(LAYER_META_FILE))?;
|
||||||
|
Ok(serde_json::from_reader(f)?)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn save(&self, layer_dir: &Path) -> OLMResult<()> {
|
||||||
|
let f = File::create(layer_dir.join(LAYER_META_FILE))?;
|
||||||
|
serde_json::to_writer_pretty(f, self)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Partition-level metadata ──────────────────────────────────────────────────
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct PartitionMeta {
|
pub struct PartitionMeta {
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
|||||||
use crate::error::{OLMError, OLMResult};
|
use crate::error::{OLMError, OLMResult};
|
||||||
use crate::evidence::{Evidence, EvidenceWriter};
|
use crate::evidence::{Evidence, EvidenceWriter};
|
||||||
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
|
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
|
||||||
|
use crate::meta::LayerMeta;
|
||||||
|
|
||||||
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
||||||
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
||||||
@@ -118,24 +119,30 @@ impl MphfLayer {
|
|||||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||||
// Write .idx last: it is only needed for random access (queries).
|
// Write .idx last: it is only needed for random access (queries).
|
||||||
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
build_unitig_idx(&unitig_path, DEFAULT_BLOCK_BITS)?;
|
||||||
|
LayerMeta::exact().save(dir)?;
|
||||||
Ok(n)
|
Ok(n)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
|
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
|
||||||
/// in `dir`. `b` is the number of fingerprint bits (1..=64).
|
/// in `dir`. `b` — fingerprint bits (1..=64); `z` — Findere consecutive
|
||||||
|
/// k-mer parameter (≥1).
|
||||||
///
|
///
|
||||||
/// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
|
/// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
|
||||||
/// No `.idx` file is written — approximate evidence needs no random access.
|
/// No `.idx` file is written — approximate evidence needs no random access.
|
||||||
pub fn build_approx_evidence(dir: &Path, b: u8) -> OLMResult<usize> {
|
pub fn build_approx_evidence(dir: &Path, b: u8, z: u8) -> OLMResult<usize> {
|
||||||
if b == 0 || b > 64 {
|
if b == 0 || b > 64 {
|
||||||
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
|
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
|
||||||
}
|
}
|
||||||
|
if z == 0 {
|
||||||
|
return Err(OLMError::InvalidLayer("z must be ≥ 1".into()));
|
||||||
|
}
|
||||||
let unitig_path = dir.join(UNITIGS_FILE);
|
let unitig_path = dir.join(UNITIGS_FILE);
|
||||||
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||||
let n = unitigs.n_kmers();
|
let n = unitigs.n_kmers();
|
||||||
|
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
|
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
|
||||||
|
LayerMeta::approx(b, z).save(dir)?;
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,6 +160,7 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
||||||
|
LayerMeta::approx(b, z).save(dir)?;
|
||||||
Ok(n)
|
Ok(n)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -214,6 +222,7 @@ impl MphfLayer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
ev.write(&dir.join(EVIDENCE_FILE))?;
|
ev.write(&dir.join(EVIDENCE_FILE))?;
|
||||||
|
LayerMeta::exact().save(dir)?;
|
||||||
Ok(n)
|
Ok(n)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user