From e1dab86daf06b87cc4ba0100236f29d114ac42ec Mon Sep 17 00:00:00 2001
From: Eric Coissac
Date: Sat, 23 May 2026 08:48:59 +0200
Subject: [PATCH] feat: add approximate kmer fingerprinting with memory-mapped storage

Introduce a new `fingerprint` module that stores packed B-bit vectors
via memory-mapped files. Expose the module publicly and add
`build_approx_evidence` to `Layer` and `MphfLayer` for generating
compact `fingerprint.bin` files. Implement `find_approx` for fast,
probabilistic kmer lookups using configurable bit-widths.

Add `bitvec` v1 as a new dependency for the bit packing. `cacheline-ef`,
`epserde`, and `memmap2` were already dependencies of the crate;
`memmap2` backs the memory-mapped reader.
---
 src/Cargo.lock                       |   1 +
 src/obilayeredmap/Cargo.toml         |   1 +
 src/obilayeredmap/src/fingerprint.rs | 209 +++++++++++++++++++++++++++
 src/obilayeredmap/src/layer.rs       |   8 ++
 src/obilayeredmap/src/lib.rs         |   1 +
 src/obilayeredmap/src/mphf_layer.rs  |  54 +++++++++-
 6 files changed, 271 insertions(+), 3 deletions(-)
 create mode 100644 src/obilayeredmap/src/fingerprint.rs

diff --git a/src/Cargo.lock b/src/Cargo.lock
index 93e5039..bc710eb 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -1592,6 +1592,7 @@ dependencies = [
 name = "obilayeredmap"
 version = "0.1.0"
 dependencies = [
+ "bitvec",
  "cacheline-ef",
  "epserde",
  "memmap2",
diff --git a/src/obilayeredmap/Cargo.toml b/src/obilayeredmap/Cargo.toml
index 1bb3d94..2f94494 100644
--- a/src/obilayeredmap/Cargo.toml
+++ b/src/obilayeredmap/Cargo.toml
@@ -12,6 +12,7 @@ cacheline-ef = "1.1"
 epserde = "0.8"
 rayon = "1"
 ndarray = "0.16"
+bitvec = "1"
 memmap2 = "0.9"
 serde = { version = "1", features = ["derive"] }
 serde_json = "1"
diff --git a/src/obilayeredmap/src/fingerprint.rs b/src/obilayeredmap/src/fingerprint.rs
new file mode 100644
index 0000000..ff0701e
--- /dev/null
+++ b/src/obilayeredmap/src/fingerprint.rs
@@ -0,0 +1,209 @@
+// Packed B-bit fingerprint vector, one entry per MPHF slot.
+//
+// File format (fingerprint.bin):
+//   magic:   b"FPVF"          (4 bytes)
+//   b:       u8               (bits per fingerprint, 1..=64)
+//   padding: [0u8; 3]
+//   n:       u64 LE           (number of slots)
+//   data:    packed bits, ceil(n*b/8) bytes, Lsb0 order
+
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+
+use bitvec::prelude::*;
+use memmap2::Mmap;
+
+use crate::error::{OLMError, OLMResult};
+
+const MAGIC: &[u8; 4] = b"FPVF";
+const HEADER: usize = 16;
+
+/// Mask selecting the low `b` bits of a `u64`; `b >= 64` selects every bit.
+#[inline]
+fn low_mask(b: u8) -> u64 {
+    if b >= 64 { u64::MAX } else { (1u64 << b) - 1 }
+}
+
+/// Bytes needed to pack `n` fingerprints of `b` bits, or `None` on overflow.
+#[inline]
+fn packed_len(n: usize, b: u8) -> Option<usize> {
+    let bits = n.checked_mul(usize::from(b))?;
+    Some(bits / 8 + usize::from(bits % 8 != 0))
+}
+
+// ── Reader ────────────────────────────────────────────────────────────────────
+
+/// Read-only view over a memory-mapped `fingerprint.bin`.
+pub struct FingerprintVec {
+    _mmap: Mmap,
+    bits: &'static BitSlice<u8, Lsb0>,
+    n: usize,
+    b: u8,
+    mask: u64,
+}
+
+impl FingerprintVec {
+    /// Maps `path` read-only and validates the header against the file size.
+    ///
+    /// # Errors
+    ///
+    /// Fails if the file cannot be opened or mapped, or with
+    /// `OLMError::InvalidLayer` when the magic or width is wrong or the file
+    /// is too short for the `n` slots its header announces.
+    pub fn open(path: &Path) -> OLMResult<Self> {
+        let f = File::open(path)?;
+        // SAFETY: the mapping is only ever read; the caller must not let the
+        // file be truncated or rewritten while the mapping is alive.
+        let mmap = unsafe { Mmap::map(&f)? };
+        if mmap.len() < HEADER || &mmap[..4] != MAGIC {
+            return Err(OLMError::InvalidLayer("bad fingerprint magic".into()));
+        }
+        let b = mmap[4];
+        if b == 0 || b > 64 {
+            return Err(OLMError::InvalidLayer("invalid fingerprint width".into()));
+        }
+        let raw_n = u64::from_le_bytes(mmap[8..16].try_into().unwrap());
+        // A corrupt or truncated file must be rejected here: the slice built
+        // below would otherwise extend past the end of the mapping.
+        let sized = usize::try_from(raw_n).ok().and_then(|n| Some((n, packed_len(n, b)?)));
+        let (n, data_len) = match sized {
+            Some((n, len)) if len <= mmap.len() - HEADER => (n, len),
+            _ => return Err(OLMError::InvalidLayer("truncated fingerprint data".into())),
+        };
+        // SAFETY: `data_len` bytes starting at `HEADER` lie inside the mapping
+        // (checked above); the mapping lives as long as Self (kept via _mmap)
+        // and the data is read-only.
+        let data: &'static [u8] = unsafe {
+            std::slice::from_raw_parts(mmap[HEADER..].as_ptr(), data_len)
+        };
+        let bits = BitSlice::<u8, Lsb0>::from_slice(data);
+        Ok(Self { _mmap: mmap, bits, n, b, mask: low_mask(b) })
+    }
+
+    /// Returns the `b`-bit fingerprint stored at `slot`.
+    #[inline]
+    pub fn get(&self, slot: usize) -> u64 {
+        debug_assert!(slot < self.n);
+        let lo = slot * self.b as usize;
+        self.bits[lo .. lo + self.b as usize].load_le::<u64>()
+    }
+
+    /// True when the low `b` bits of `fingerprint` equal the value at `slot`.
+    #[inline]
+    pub fn matches(&self, slot: usize, fingerprint: u64) -> bool {
+        self.get(slot) == (fingerprint & self.mask)
+    }
+
+    pub fn n(&self) -> usize { self.n }
+    pub fn b(&self) -> u8 { self.b }
+}
+
+// ── Writer ────────────────────────────────────────────────────────────────────
+
+/// In-memory builder for a `fingerprint.bin` file.
+pub struct FingerprintVecWriter {
+    buf: Vec<u8>,
+    n: usize,
+    b: u8,
+}
+
+impl FingerprintVecWriter {
+    /// Allocates a zeroed vector of `n` slots of `b` bits each.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `b` is outside 1..=64 or if `n * b` overflows `usize`.
+    pub fn new(n: usize, b: u8) -> Self {
+        assert!(b > 0 && b <= 64, "fingerprint width must be 1..=64");
+        let data_bytes = packed_len(n, b).expect("fingerprint vector too large");
+        Self { buf: vec![0u8; data_bytes], n, b }
+    }
+
+    /// Stores the low `b` bits of `fingerprint` at `slot`.
+    #[inline]
+    pub fn set(&mut self, slot: usize, fingerprint: u64) {
+        debug_assert!(slot < self.n);
+        let lo = slot * self.b as usize;
+        let bits = BitSlice::<u8, Lsb0>::from_slice_mut(&mut self.buf);
+        // Mask explicitly so the stored value never depends on how the
+        // bit-field store treats integers wider than the target range.
+        bits[lo .. lo + self.b as usize].store_le(fingerprint & low_mask(self.b));
+    }
+
+    /// Writes the header and packed data to `path`, replacing any existing file.
+    pub fn write(self, path: &Path) -> OLMResult<()> {
+        let mut f = BufWriter::new(File::create(path).map_err(OLMError::Io)?);
+        f.write_all(MAGIC).map_err(OLMError::Io)?;
+        f.write_all(&[self.b, 0, 0, 0]).map_err(OLMError::Io)?;
+        f.write_all(&(self.n as u64).to_le_bytes()).map_err(OLMError::Io)?;
+        f.write_all(&self.buf).map_err(OLMError::Io)?;
+        // Flush explicitly: errors raised while `BufWriter` is dropped are lost.
+        f.flush().map_err(OLMError::Io)?;
+        Ok(())
+    }
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::tempdir;
+
+    fn roundtrip(n: usize, b: u8, values: &[u64]) {
+        let dir = tempdir().unwrap();
+        let path = dir.path().join("fp.bin");
+        let mask = low_mask(b);
+        let mut w = FingerprintVecWriter::new(n, b);
+        for (i, &v) in values.iter().enumerate() {
+            w.set(i, v);
+        }
+        w.write(&path).unwrap();
+        let r = FingerprintVec::open(&path).unwrap();
+        assert_eq!(r.n(), n);
+        assert_eq!(r.b(), b);
+        for (i, &v) in values.iter().enumerate() {
+            assert_eq!(r.get(i), v & mask, "slot {i} b={b}");
+        }
+    }
+
+    #[test]
+    fn roundtrip_b1() { roundtrip(8, 1, &[1, 0, 1, 1, 0, 0, 1, 0]); }
+    #[test]
+    fn roundtrip_b8() { roundtrip(4, 8, &[0, 127, 200, 255]); }
+    #[test]
+    fn roundtrip_b16() { roundtrip(3, 16, &[0, 0xABCD, 0xFFFF]); }
+    #[test]
+    fn roundtrip_b7() { roundtrip(5, 7, &[0, 63, 127, 1, 42]); }
+    #[test]
+    fn roundtrip_b13_unaligned() {
+        let vals: Vec<u64> = (0..20).map(|i| (i * 317) % (1 << 13)).collect();
+        roundtrip(20, 13, &vals);
+    }
+    #[test]
+    fn set_keeps_only_low_bits() { roundtrip(2, 8, &[0x1FF, 0]); }
+    #[test]
+    fn open_rejects_truncated_payload() {
+        let dir = tempdir().unwrap();
+        let path = dir.path().join("fp.bin");
+        // Valid header announcing 1000 8-bit slots, with no payload behind it.
+        let mut bytes = MAGIC.to_vec();
+        bytes.extend_from_slice(&[8, 0, 0, 0]);
+        bytes.extend_from_slice(&1000u64.to_le_bytes());
+        std::fs::write(&path, &bytes).unwrap();
+        assert!(FingerprintVec::open(&path).is_err());
+    }
+    #[test]
+    fn matches_returns_true_for_exact() {
+        let dir = tempdir().unwrap();
+        let path = dir.path().join("fp.bin");
+        let mut w = FingerprintVecWriter::new(4, 8);
+        w.set(0, 42); w.set(1, 0); w.set(2, 255); w.set(3, 17);
+        w.write(&path).unwrap();
+        let r = FingerprintVec::open(&path).unwrap();
+        assert!( r.matches(0, 42));
+        assert!(!r.matches(0, 43));
+        assert!( r.matches(2, 255));
+    }
+}
diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs
index a25ee62..b67709f 100644
--- a/src/obilayeredmap/src/layer.rs
+++ b/src/obilayeredmap/src/layer.rs
@@ -84,6 +84,14 @@ impl Layer {
     pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult {
         MphfLayer::build_exact_evidence(layer_dir)
     }
+
+    /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
+    /// present in `layer_dir`. `b` is the fingerprint width in bits (1..=64).
+    ///
+    /// See [`MphfLayer::build_approx_evidence`] for the full contract.
+    pub fn build_approx_evidence(layer_dir: &Path, b: u8) -> OLMResult<usize> {
+        MphfLayer::build_approx_evidence(layer_dir, b)
+    }
 }
 
 // ── Mode 1 — set membership ───────────────────────────────────────────────────
diff --git a/src/obilayeredmap/src/lib.rs b/src/obilayeredmap/src/lib.rs
index a883041..dacbe7a 100644
--- a/src/obilayeredmap/src/lib.rs
+++ b/src/obilayeredmap/src/lib.rs
@@ -1,5 +1,6 @@
 pub mod error;
 pub mod evidence;
+pub mod fingerprint;
 pub mod layer;
 pub mod layered_store;
 pub mod map;
diff --git a/src/obilayeredmap/src/mphf_layer.rs b/src/obilayeredmap/src/mphf_layer.rs
index 1713bce..a025784 100644
--- a/src/obilayeredmap/src/mphf_layer.rs
+++ b/src/obilayeredmap/src/mphf_layer.rs
@@ -9,10 +9,12 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
 
 use crate::error::{OLMError, OLMResult};
 use crate::evidence::{Evidence, EvidenceWriter};
+use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
 
-pub(crate) const MPHF_FILE:     &str = "mphf.bin";
-pub(crate) const UNITIGS_FILE:  &str = "unitigs.bin";
-pub(crate) const EVIDENCE_FILE: &str = "evidence.bin";
+pub(crate) const MPHF_FILE:        &str = "mphf.bin";
+pub(crate) const UNITIGS_FILE:     &str = "unitigs.bin";
+pub(crate) const EVIDENCE_FILE:    &str = "evidence.bin";
+pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
 
 pub(crate) type Mphf = PtrHash>, Xx64, Vec>;
 
@@ -50,6 +52,17 @@ impl MphfLayer {
         }
     }
 
+    /// Returns `Some(slot)` if `kmer` passes the fingerprint check, `None` otherwise
+    /// (including when `fp` holds fewer slots than this layer and has no entry).
+    /// False positive rate per query: 1/2^b (where b is the fingerprint width
+    /// used at build time). No `.idx` or `evidence.bin` needed at query time.
+    #[inline]
+    pub fn find_approx(&self, kmer: CanonicalKmer, fp: &FingerprintVec) -> Option<usize> {
+        let slot = self.mphf.index(&kmer.raw());
+        if slot >= self.n || slot >= fp.n() { return None; }
+        if fp.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
+    }
+
     pub fn n(&self) -> usize { self.n }
 
     pub fn unitig_writer(dir: &Path) -> OLMResult {
@@ -108,6 +121,41 @@ impl MphfLayer {
         Ok(n)
     }
 
+    /// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
+    /// in `dir`. `b` is the number of fingerprint bits (1..=64).
+    ///
+    /// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
+    /// No `.idx` file is written — approximate evidence needs no random access.
+    pub fn build_approx_evidence(dir: &Path, b: u8) -> OLMResult<usize> {
+        if b == 0 || b > 64 {
+            return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
+        }
+        let unitig_path = dir.join(UNITIGS_FILE);
+        let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
+        let n = unitigs.n_kmers();
+
+        if n == 0 {
+            FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
+            return Ok(0);
+        }
+
+        let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
+            .map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
+
+        let mut fw = FingerprintVecWriter::new(n, b);
+
+        for kmer in unitigs.iter_canonical_kmers() {
+            let slot = mphf.index(&kmer.raw());
+            if slot >= n {
+                return Err(OLMError::Mphf("slot out of bounds".into()));
+            }
+            fw.set(slot, kmer.seq_hash());
+        }
+
+        fw.write(&dir.join(FINGERPRINT_FILE))?;
+        Ok(n)
+    }
+
     /// Build MPHF and exact evidence from the unitigs file already present in
     /// `dir`. Calls `fill_slot(slot, kmer)` once per kmer for DataStore
     /// population. Returns the number of kmers indexed.