feat: add approximate kmer fingerprinting with memory-mapped storage
Introduce a new `fingerprint` module that stores packed B-bit vectors via memory-mapped files. Expose the module publicly and add `build_approx_evidence` to `Layer` and `MphfLayer` for generating compact `fingerprint.bin` files. Implement `find_approx` for fast, probabilistic kmer lookups using configurable bit-widths. Update dependencies to `bitvec` v1 and add `cacheline-ef`, `epserde`, and `memmap2` to support the new storage and serialization logic.
This commit is contained in:
Generated
+1
@@ -1592,6 +1592,7 @@ dependencies = [
|
||||
name = "obilayeredmap"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"bitvec",
|
||||
"cacheline-ef",
|
||||
"epserde",
|
||||
"memmap2",
|
||||
|
||||
@@ -12,6 +12,7 @@ cacheline-ef = "1.1"
|
||||
epserde = "0.8"
|
||||
rayon = "1"
|
||||
ndarray = "0.16"
|
||||
bitvec = "1"
|
||||
memmap2 = "0.9"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -0,0 +1,151 @@
|
||||
// Packed B-bit fingerprint vector, one entry per MPHF slot.
|
||||
//
|
||||
// File format (fingerprint.bin):
|
||||
// magic: b"FPVF" (4 bytes)
|
||||
// b: u8 (bits per fingerprint, 1..=64)
|
||||
// padding: [0u8; 3]
|
||||
// n: u64 LE (number of slots)
|
||||
// data: packed bits, ceil(n*b/8) bytes, Lsb0 order
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::Path;
|
||||
|
||||
use bitvec::prelude::*;
|
||||
use memmap2::Mmap;
|
||||
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
|
||||
const MAGIC: &[u8; 4] = b"FPVF";
|
||||
const HEADER: usize = 16;
|
||||
|
||||
// ── Reader ────────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct FingerprintVec {
|
||||
_mmap: Mmap,
|
||||
bits: &'static BitSlice<u8, Lsb0>,
|
||||
n: usize,
|
||||
b: u8,
|
||||
mask: u64,
|
||||
}
|
||||
|
||||
impl FingerprintVec {
|
||||
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||
let f = File::open(path)?;
|
||||
let mmap = unsafe { Mmap::map(&f)? };
|
||||
if mmap.len() < HEADER || &mmap[..4] != MAGIC {
|
||||
return Err(OLMError::InvalidLayer("bad fingerprint magic".into()));
|
||||
}
|
||||
let b = mmap[4];
|
||||
if b == 0 || b > 64 {
|
||||
return Err(OLMError::InvalidLayer("invalid fingerprint width".into()));
|
||||
}
|
||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||
let mask: u64 = if b == 64 { u64::MAX } else { (1u64 << b) - 1 };
|
||||
// SAFETY: the mmap lives as long as Self (kept via _mmap); data is read-only.
|
||||
let data: &'static [u8] = unsafe {
|
||||
std::slice::from_raw_parts(mmap[HEADER..].as_ptr(), (n * b as usize + 7) / 8)
|
||||
};
|
||||
let bits = BitSlice::<u8, Lsb0>::from_slice(data);
|
||||
Ok(Self { _mmap: mmap, bits, n, b, mask })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn get(&self, slot: usize) -> u64 {
|
||||
debug_assert!(slot < self.n);
|
||||
let lo = slot * self.b as usize;
|
||||
self.bits[lo .. lo + self.b as usize].load_le::<u64>()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn matches(&self, slot: usize, fingerprint: u64) -> bool {
|
||||
self.get(slot) == (fingerprint & self.mask)
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
pub fn b(&self) -> u8 { self.b }
|
||||
}
|
||||
|
||||
// ── Writer ────────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct FingerprintVecWriter {
|
||||
buf: Vec<u8>,
|
||||
n: usize,
|
||||
b: u8,
|
||||
}
|
||||
|
||||
impl FingerprintVecWriter {
|
||||
pub fn new(n: usize, b: u8) -> Self {
|
||||
assert!(b > 0 && b <= 64, "fingerprint width must be 1..=64");
|
||||
let data_bytes = (n * b as usize + 7) / 8;
|
||||
Self { buf: vec![0u8; data_bytes], n, b }
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn set(&mut self, slot: usize, fingerprint: u64) {
|
||||
debug_assert!(slot < self.n);
|
||||
let lo = slot * self.b as usize;
|
||||
let bits = BitSlice::<u8, Lsb0>::from_slice_mut(&mut self.buf);
|
||||
bits[lo .. lo + self.b as usize].store_le(fingerprint);
|
||||
}
|
||||
|
||||
pub fn write(self, path: &Path) -> OLMResult<()> {
|
||||
let mut f = BufWriter::new(File::create(path).map_err(OLMError::Io)?);
|
||||
f.write_all(MAGIC).map_err(OLMError::Io)?;
|
||||
f.write_all(&[self.b, 0, 0, 0]).map_err(OLMError::Io)?;
|
||||
f.write_all(&(self.n as u64).to_le_bytes()).map_err(OLMError::Io)?;
|
||||
f.write_all(&self.buf).map_err(OLMError::Io)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Mask selecting the low `b` bits of a `u64`.
    fn low_bits(b: u8) -> u64 {
        if b == 64 {
            u64::MAX
        } else {
            (1u64 << b) - 1
        }
    }

    /// Writes `values` into an `n`-slot, `b`-bit file, reopens it, and checks
    /// that the header fields and every stored fingerprint read back intact.
    fn roundtrip(n: usize, b: u8, values: &[u64]) {
        let dir = tempdir().unwrap();
        let path = dir.path().join("fp.bin");

        let mut writer = FingerprintVecWriter::new(n, b);
        values
            .iter()
            .enumerate()
            .for_each(|(slot, &value)| writer.set(slot, value));
        writer.write(&path).unwrap();

        let reader = FingerprintVec::open(&path).unwrap();
        assert_eq!(reader.n(), n);
        assert_eq!(reader.b(), b);

        let mask = low_bits(b);
        for (i, &expected) in values.iter().enumerate() {
            assert_eq!(reader.get(i), expected & mask, "slot {i} b={b}");
        }
    }

    #[test]
    fn roundtrip_b1() {
        roundtrip(8, 1, &[1, 0, 1, 1, 0, 0, 1, 0]);
    }

    #[test]
    fn roundtrip_b8() {
        roundtrip(4, 8, &[0, 127, 200, 255]);
    }

    #[test]
    fn roundtrip_b16() {
        roundtrip(3, 16, &[0, 0xABCD, 0xFFFF]);
    }

    #[test]
    fn roundtrip_b7() {
        roundtrip(5, 7, &[0, 63, 127, 1, 42]);
    }

    #[test]
    fn roundtrip_b13_unaligned() {
        // 13-bit entries straddle byte boundaries at almost every slot.
        let vals: Vec<u64> = (0..20).map(|i| (i * 317) % (1 << 13)).collect();
        roundtrip(20, 13, &vals);
    }

    #[test]
    fn matches_returns_true_for_exact() {
        let dir = tempdir().unwrap();
        let path = dir.path().join("fp.bin");

        let mut writer = FingerprintVecWriter::new(4, 8);
        for (slot, value) in [(0, 42), (1, 0), (2, 255), (3, 17)] {
            writer.set(slot, value);
        }
        writer.write(&path).unwrap();

        let reader = FingerprintVec::open(&path).unwrap();
        assert!(reader.matches(0, 42));
        assert!(!reader.matches(0, 43));
        assert!(reader.matches(2, 255));
    }
}
|
||||
@@ -84,6 +84,14 @@ impl<D: LayerData> Layer<D> {
|
||||
pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult<usize> {
|
||||
MphfLayer::build_exact_evidence(layer_dir)
|
||||
}
|
||||
|
||||
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
|
||||
/// present in `layer_dir`. `b` is the fingerprint width in bits (1..=64).
|
||||
///
|
||||
/// See [`MphfLayer::build_approx_evidence`] for the full contract.
|
||||
pub fn build_approx_evidence(layer_dir: &Path, b: u8) -> OLMResult<usize> {
|
||||
MphfLayer::build_approx_evidence(layer_dir, b)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Mode 1 — set membership ───────────────────────────────────────────────────
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
pub mod error;
|
||||
pub mod evidence;
|
||||
pub mod fingerprint;
|
||||
pub mod layer;
|
||||
pub mod layered_store;
|
||||
pub mod map;
|
||||
|
||||
@@ -9,10 +9,12 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
use crate::evidence::{Evidence, EvidenceWriter};
|
||||
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
|
||||
|
||||
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
||||
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
||||
pub(crate) const EVIDENCE_FILE: &str = "evidence.bin";
|
||||
pub(crate) const MPHF_FILE: &str = "mphf.bin";
|
||||
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
|
||||
pub(crate) const EVIDENCE_FILE: &str = "evidence.bin";
|
||||
pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
|
||||
|
||||
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
@@ -50,6 +52,17 @@ impl MphfLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `Some(slot)` if `kmer` passes the fingerprint check, `None` otherwise.
|
||||
///
|
||||
/// False positive rate per query: 1/2^b (where b is the fingerprint width
|
||||
/// used at build time). No `.idx` or `evidence.bin` needed at query time.
|
||||
#[inline]
|
||||
pub fn find_approx(&self, kmer: CanonicalKmer, fp: &FingerprintVec) -> Option<usize> {
|
||||
let slot = self.mphf.index(&kmer.raw());
|
||||
if slot >= self.n { return None; }
|
||||
if fp.matches(slot, kmer.seq_hash()) { Some(slot) } else { None }
|
||||
}
|
||||
|
||||
pub fn n(&self) -> usize { self.n }
|
||||
|
||||
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
|
||||
@@ -108,6 +121,41 @@ impl MphfLayer {
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already present
|
||||
/// in `dir`. `b` is the number of fingerprint bits (1..=64).
|
||||
///
|
||||
/// The fingerprint for each slot is the low `b` bits of `kmer.seq_hash()`.
|
||||
/// No `.idx` file is written — approximate evidence needs no random access.
|
||||
pub fn build_approx_evidence(dir: &Path, b: u8) -> OLMResult<usize> {
|
||||
if b == 0 || b > 64 {
|
||||
return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
|
||||
}
|
||||
let unitig_path = dir.join(UNITIGS_FILE);
|
||||
let unitigs = UnitigFileReader::open_sequential(&unitig_path)?;
|
||||
let n = unitigs.n_kmers();
|
||||
|
||||
if n == 0 {
|
||||
FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let mphf: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
|
||||
let mut fw = FingerprintVecWriter::new(n, b);
|
||||
|
||||
for kmer in unitigs.iter_canonical_kmers() {
|
||||
let slot = mphf.index(&kmer.raw());
|
||||
if slot >= n {
|
||||
return Err(OLMError::Mphf("slot out of bounds".into()));
|
||||
}
|
||||
fw.set(slot, kmer.seq_hash());
|
||||
}
|
||||
|
||||
fw.write(&dir.join(FINGERPRINT_FILE))?;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
/// Build MPHF and exact evidence from the unitigs file already present in
|
||||
/// `dir`. Calls `fill_slot(slot, kmer)` once per kmer for DataStore
|
||||
/// population. Returns the number of kmers indexed.
|
||||
|
||||
Reference in New Issue
Block a user