Push kztouvrzoqym #8

Merged
coissac merged 10 commits from push-kztouvrzoqym into main 2026-05-23 12:04:51 +00:00
6 changed files with 213 additions and 3 deletions
Showing only changes of commit e1dab86daf - Show all commits
+1
View File
@@ -1592,6 +1592,7 @@ dependencies = [
name = "obilayeredmap" name = "obilayeredmap"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"bitvec",
"cacheline-ef", "cacheline-ef",
"epserde", "epserde",
"memmap2", "memmap2",
+1
View File
@@ -12,6 +12,7 @@ cacheline-ef = "1.1"
epserde = "0.8" epserde = "0.8"
rayon = "1" rayon = "1"
ndarray = "0.16" ndarray = "0.16"
bitvec = "1"
memmap2 = "0.9" memmap2 = "0.9"
serde = { version = "1", features = ["derive"] } serde = { version = "1", features = ["derive"] }
serde_json = "1" serde_json = "1"
+151
View File
@@ -0,0 +1,151 @@
// Packed B-bit fingerprint vector, one entry per MPHF slot.
//
// File format (fingerprint.bin):
// magic: b"FPVF" (4 bytes)
// b: u8 (bits per fingerprint, 1..=64)
// padding: [0u8; 3]
// n: u64 LE (number of slots)
// data: packed bits, ceil(n*b/8) bytes, Lsb0 order
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::Path;
use bitvec::prelude::*;
use memmap2::Mmap;
use crate::error::{OLMError, OLMResult};
/// Four-byte tag that starts every fingerprint file.
const MAGIC: &[u8; 4] = b"FPVF";
/// Size in bytes of the fixed header that precedes the packed data:
/// 4 (magic) + 1 (bit width) + 3 (padding) + 8 (slot count, u64 LE).
const HEADER: usize = 16;
// ── Reader ────────────────────────────────────────────────────────────────────
/// Read-only, memory-mapped view of a packed fingerprint file.
///
/// Each slot holds a `b`-bit fingerprint; slot `i` occupies bits
/// `i*b .. (i+1)*b` of the data section, in `Lsb0` order.
pub struct FingerprintVec {
// Keeps the mapping alive; `bits` points into it.
_mmap: Mmap,
// Bit view over the data section that follows the header. The `'static`
// lifetime is a stand-in: the slice is only valid while `_mmap` is alive,
// so it must never be handed out with that lifetime.
bits: &'static BitSlice<u8, Lsb0>,
// Number of slots, as read from the header.
n: usize,
// Fingerprint width in bits (validated to 1..=64 when opening).
b: u8,
// Mask selecting the low `b` bits of a u64.
mask: u64,
}
impl FingerprintVec {
    /// Memory-maps the fingerprint file at `path` and validates its header.
    ///
    /// # Errors
    ///
    /// Returns an I/O error if the file cannot be opened or mapped, and
    /// `OLMError::InvalidLayer` if the magic is wrong, the bit width is
    /// outside `1..=64`, the slot count does not fit in `usize`, or the file
    /// is too short to hold `n` fingerprints of `b` bits each.
    pub fn open(path: &Path) -> OLMResult<Self> {
        let f = File::open(path)?;
        // SAFETY: the mapping is only ever read. As with any file-backed
        // mapping, the file must not be truncated or rewritten by another
        // process while it is mapped.
        let mmap = unsafe { Mmap::map(&f)? };
        if mmap.len() < HEADER || &mmap[..4] != MAGIC {
            return Err(OLMError::InvalidLayer("bad fingerprint magic".into()));
        }
        let b = mmap[4];
        if b == 0 || b > 64 {
            return Err(OLMError::InvalidLayer("invalid fingerprint width".into()));
        }

        let raw_n = u64::from_le_bytes(
            mmap[8..HEADER]
                .try_into()
                .expect("header slice is exactly 8 bytes"),
        );
        // A plain `as usize` would silently truncate on 32-bit targets.
        let n = usize::try_from(raw_n)
            .map_err(|_| OLMError::InvalidLayer("fingerprint slot count too large".into()))?;

        // ceil(n * b / 8), rejecting any header whose size overflows usize.
        let data_bytes = n
            .checked_mul(usize::from(b))
            .and_then(|total_bits| total_bits.checked_add(7))
            .map(|total_bits| total_bits / 8)
            .ok_or_else(|| OLMError::InvalidLayer("fingerprint size overflow".into()))?;

        // The header is untrusted: make sure the mapping really contains the
        // bytes it advertises before building a raw slice over them.
        if mmap.len() - HEADER < data_bytes {
            return Err(OLMError::InvalidLayer("truncated fingerprint data".into()));
        }

        // `b` is in 1..=64, so the shift amount is in 0..=63.
        let mask: u64 = u64::MAX >> (64 - b);

        // SAFETY: `data_bytes` was checked above to lie entirely within the
        // mapping. The mapping is owned by `_mmap` for the whole life of
        // `Self`, and moving the `Mmap` value does not move the mapped pages,
        // so the pointer stays valid. The data is never written through.
        let data: &'static [u8] =
            unsafe { std::slice::from_raw_parts(mmap[HEADER..].as_ptr(), data_bytes) };
        let bits = BitSlice::<u8, Lsb0>::from_slice(data);
        Ok(Self { _mmap: mmap, bits, n, b, mask })
    }

    /// Returns the `b`-bit fingerprint stored at `slot`, zero-extended to `u64`.
    ///
    /// `slot` must be `< self.n()`; this is only checked in debug builds.
    #[inline]
    pub fn get(&self, slot: usize) -> u64 {
        debug_assert!(slot < self.n);
        let lo = slot * self.b as usize;
        self.bits[lo..lo + self.b as usize].load_le::<u64>()
    }

    /// Returns `true` if the fingerprint stored at `slot` equals the low `b`
    /// bits of `fingerprint`.
    #[inline]
    pub fn matches(&self, slot: usize, fingerprint: u64) -> bool {
        self.get(slot) == (fingerprint & self.mask)
    }

    /// Number of slots recorded in the file header.
    pub fn n(&self) -> usize {
        self.n
    }

    /// Fingerprint width in bits.
    pub fn b(&self) -> u8 {
        self.b
    }
}
// ── Writer ────────────────────────────────────────────────────────────────────
/// In-memory builder for a packed fingerprint file.
///
/// All slots start at zero; fill them with `set`, then persist with `write`.
pub struct FingerprintVecWriter {
// Packed data section, ceil(n*b/8) bytes, zero-initialised.
buf: Vec<u8>,
// Number of slots.
n: usize,
// Fingerprint width in bits (1..=64).
b: u8,
}
impl FingerprintVecWriter {
    /// Creates a zero-filled writer for `n` slots of `b` bits each.
    ///
    /// # Panics
    ///
    /// Panics if `b` is not in `1..=64`, or if the packed size `n * b` bits
    /// does not fit in `usize`.
    pub fn new(n: usize, b: u8) -> Self {
        assert!(b > 0 && b <= 64, "fingerprint width must be 1..=64");
        // Checked arithmetic: a wrapped product in release builds would
        // allocate a too-small buffer instead of failing loudly.
        let data_bytes = n
            .checked_mul(usize::from(b))
            .and_then(|total_bits| total_bits.checked_add(7))
            .expect("fingerprint vector size overflows usize")
            / 8;
        Self { buf: vec![0u8; data_bytes], n, b }
    }

    /// Stores the low `b` bits of `fingerprint` at `slot`, replacing whatever
    /// was there.
    ///
    /// `slot` must be `< n`; this is only checked in debug builds.
    #[inline]
    pub fn set(&mut self, slot: usize, fingerprint: u64) {
        debug_assert!(slot < self.n);
        let lo = slot * self.b as usize;
        let bits = BitSlice::<u8, Lsb0>::from_slice_mut(&mut self.buf);
        bits[lo..lo + self.b as usize].store_le(fingerprint);
    }

    /// Writes the header followed by the packed data to `path`, creating or
    /// truncating the file.
    ///
    /// # Errors
    ///
    /// Returns `OLMError::Io` if the file cannot be created or any write,
    /// including the final flush, fails.
    pub fn write(self, path: &Path) -> OLMResult<()> {
        let mut f = BufWriter::new(File::create(path).map_err(OLMError::Io)?);
        f.write_all(MAGIC).map_err(OLMError::Io)?;
        // Width byte followed by three padding bytes.
        f.write_all(&[self.b, 0, 0, 0]).map_err(OLMError::Io)?;
        f.write_all(&(self.n as u64).to_le_bytes()).map_err(OLMError::Io)?;
        f.write_all(&self.buf).map_err(OLMError::Io)?;
        // Flush explicitly: `BufWriter`'s `Drop` discards flush errors, which
        // would let a short or missing file be reported as success.
        f.flush().map_err(OLMError::Io)?;
        Ok(())
    }
}
// ── Tests ─────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Writes `values` into the first slots of an `n`-slot, `b`-bit vector,
    /// reopens the file and checks the header fields and every stored value
    /// (compared after masking to `b` bits).
    fn roundtrip(n: usize, b: u8, values: &[u64]) {
        let tmp = tempdir().unwrap();
        let file = tmp.path().join("fp.bin");
        let keep: u64 = match b {
            64 => u64::MAX,
            width => (1u64 << width) - 1,
        };

        let mut writer = FingerprintVecWriter::new(n, b);
        values
            .iter()
            .enumerate()
            .for_each(|(slot, &value)| writer.set(slot, value));
        writer.write(&file).unwrap();

        let reader = FingerprintVec::open(&file).unwrap();
        assert_eq!(reader.n(), n);
        assert_eq!(reader.b(), b);
        for (i, &v) in values.iter().enumerate() {
            assert_eq!(reader.get(i), v & keep, "slot {i} b={b}");
        }
    }

    #[test]
    fn roundtrip_b1() {
        roundtrip(8, 1, &[1, 0, 1, 1, 0, 0, 1, 0]);
    }

    #[test]
    fn roundtrip_b8() {
        roundtrip(4, 8, &[0, 127, 200, 255]);
    }

    #[test]
    fn roundtrip_b16() {
        roundtrip(3, 16, &[0, 0xABCD, 0xFFFF]);
    }

    #[test]
    fn roundtrip_b7() {
        roundtrip(5, 7, &[0, 63, 127, 1, 42]);
    }

    // 13 bits per slot: entries straddle byte boundaries.
    #[test]
    fn roundtrip_b13_unaligned() {
        let vals: Vec<u64> = (0..20).map(|i| (i * 317) % (1 << 13)).collect();
        roundtrip(20, 13, &vals);
    }

    #[test]
    fn matches_returns_true_for_exact() {
        let tmp = tempdir().unwrap();
        let file = tmp.path().join("fp.bin");

        let mut writer = FingerprintVecWriter::new(4, 8);
        for (slot, value) in [(0, 42), (1, 0), (2, 255), (3, 17)] {
            writer.set(slot, value);
        }
        writer.write(&file).unwrap();

        let reader = FingerprintVec::open(&file).unwrap();
        assert!(reader.matches(0, 42));
        assert!(!reader.matches(0, 43));
        assert!(reader.matches(2, 255));
    }
}
+8
View File
@@ -84,6 +84,14 @@ impl<D: LayerData> Layer<D> {
pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult<usize> { pub fn build_exact_evidence(layer_dir: &Path) -> OLMResult<usize> {
MphfLayer::build_exact_evidence(layer_dir) MphfLayer::build_exact_evidence(layer_dir)
} }
/// Build `fingerprint.bin` from `unitigs.bin` and `mphf.bin` already
/// present in `layer_dir`. `b` is the fingerprint width in bits (1..=64).
///
/// Thin wrapper that forwards to [`MphfLayer::build_approx_evidence`];
/// see that function for the full contract, including the meaning of the
/// returned count and the error conditions.
pub fn build_approx_evidence(layer_dir: &Path, b: u8) -> OLMResult<usize> {
MphfLayer::build_approx_evidence(layer_dir, b)
}
} }
// ── Mode 1 — set membership ─────────────────────────────────────────────────── // ── Mode 1 — set membership ───────────────────────────────────────────────────
+1
View File
@@ -1,5 +1,6 @@
pub mod error; pub mod error;
pub mod evidence; pub mod evidence;
pub mod fingerprint;
pub mod layer; pub mod layer;
pub mod layered_store; pub mod layered_store;
pub mod map; pub mod map;
+48
View File
@@ -9,10 +9,12 @@ use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
use crate::error::{OLMError, OLMResult}; use crate::error::{OLMError, OLMResult};
use crate::evidence::{Evidence, EvidenceWriter}; use crate::evidence::{Evidence, EvidenceWriter};
use crate::fingerprint::{FingerprintVec, FingerprintVecWriter};
pub(crate) const MPHF_FILE: &str = "mphf.bin"; pub(crate) const MPHF_FILE: &str = "mphf.bin";
pub(crate) const UNITIGS_FILE: &str = "unitigs.bin"; pub(crate) const UNITIGS_FILE: &str = "unitigs.bin";
pub(crate) const EVIDENCE_FILE: &str = "evidence.bin"; pub(crate) const EVIDENCE_FILE: &str = "evidence.bin";
pub(crate) const FINGERPRINT_FILE: &str = "fingerprint.bin";
pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>; pub(crate) type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
@@ -50,6 +52,17 @@ impl MphfLayer {
} }
} }
/// Approximate membership lookup: returns `Some(slot)` when the MPHF slot of
/// `kmer` is in range and its stored fingerprint matches, `None` otherwise.
///
/// False positive rate per query: 1/2^b (where b is the fingerprint width
/// used at build time). No `.idx` or `evidence.bin` needed at query time.
#[inline]
pub fn find_approx(&self, kmer: CanonicalKmer, fp: &FingerprintVec) -> Option<usize> {
    let slot = self.mphf.index(&kmer.raw());
    // Short-circuit keeps the fingerprint lookup from running on an
    // out-of-range slot.
    let hit = slot < self.n && fp.matches(slot, kmer.seq_hash());
    hit.then_some(slot)
}
pub fn n(&self) -> usize { self.n } pub fn n(&self) -> usize { self.n }
pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> { pub fn unitig_writer(dir: &Path) -> OLMResult<UnitigFileWriter> {
@@ -108,6 +121,41 @@ impl MphfLayer {
Ok(n) Ok(n)
} }
/// Build `fingerprint.bin` in `dir` from the `unitigs.bin` and `mphf.bin`
/// files already present there. `b` is the fingerprint width in bits (1..=64).
///
/// Each k-mer's slot receives the low `b` bits of `kmer.seq_hash()`.
/// No `.idx` file is written — approximate evidence needs no random access.
///
/// Returns the number of k-mers reported by the unitig file. When that count
/// is zero, an empty fingerprint file is written and the MPHF is not loaded.
pub fn build_approx_evidence(dir: &Path, b: u8) -> OLMResult<usize> {
    if !(1..=64).contains(&b) {
        return Err(OLMError::InvalidLayer("fingerprint width must be 1..=64".into()));
    }

    let unitig_path = dir.join(UNITIGS_FILE);
    let reader = UnitigFileReader::open_sequential(&unitig_path)?;
    let total = reader.n_kmers();

    if total == 0 {
        FingerprintVecWriter::new(0, b).write(&dir.join(FINGERPRINT_FILE))?;
        return Ok(0);
    }

    let hasher: Mphf = Mphf::load_full(&dir.join(MPHF_FILE))
        .map_err(|e| OLMError::InvalidLayer(e.to_string()))?;

    let mut writer = FingerprintVecWriter::new(total, b);
    for kmer in reader.iter_canonical_kmers() {
        match hasher.index(&kmer.raw()) {
            slot if slot < total => writer.set(slot, kmer.seq_hash()),
            // An index outside 0..total means the MPHF does not belong to
            // this unitig set.
            _ => return Err(OLMError::Mphf("slot out of bounds".into())),
        }
    }

    writer.write(&dir.join(FINGERPRINT_FILE))?;
    Ok(total)
}
/// Build MPHF and exact evidence from the unitigs file already present in /// Build MPHF and exact evidence from the unitigs file already present in
/// `dir`. Calls `fill_slot(slot, kmer)` once per kmer for DataStore /// `dir`. Calls `fill_slot(slot, kmer)` once per kmer for DataStore
/// population. Returns the number of kmers indexed. /// population. Returns the number of kmers indexed.