Add persistent compact integer vector and cache-line-optimized MPHF

Introduce the `obicompactvec` crate, featuring a two-tier, memory-mapped integer vector that uses a primary `u8` array with a sentinel for overflow dispatch and a sparse L1-resident index for fast random access. Implement builder and reader modules with zero-copy serialization and comprehensive test coverage. Update `obilayeredmap` to replace the `DefaultPtrHash` type with an explicitly parameterized, cache-line-optimized `Mphf` alias (`PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>`), and verify during layer construction that every k-mer maps to an in-bounds, previously unused slot, returning an `OLMError::Mphf` error otherwise (explicit bounds checking and duplicate-slot detection). Add documentation for both modules and update project configuration files accordingly.
This commit is contained in:
Eric Coissac
2026-05-13 06:24:43 +08:00
parent 84ed752b78
commit f2de79acde
14 changed files with 710 additions and 91 deletions
+9 -8
View File
@@ -4,14 +4,15 @@ version = "0.1.0"
edition = "2024"
[dependencies]
obikseq = { path = "../obikseq" }
obiskio = { path = "../obiskio" }
ptr_hash = "1.1"
epserde = "0.8"
rayon = "1"
memmap2 = "0.9"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
obikseq = { path = "../obikseq" }
obiskio = { path = "../obiskio" }
ptr_hash = "1.1"
cacheline-ef = "1.1"
epserde = "0.8"
rayon = "1"
memmap2 = "0.9"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
[dev-dependencies]
tempfile = "3"
+26 -20
View File
@@ -2,10 +2,11 @@ use std::collections::HashMap;
use std::fs;
use std::path::Path;
use cacheline_ef::{CachelineEf, CachelineEfVec};
use epserde::prelude::*;
use obikseq::CanonicalKmer;
use obiskio::{UnitigFileReader, UnitigFileWriter};
use ptr_hash::{DefaultPtrHash, PtrHashParams};
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
use crate::counts::{Counts, CountsWriter};
use crate::error::{OLMError, OLMResult};
@@ -16,8 +17,10 @@ const UNITIGS_FILE: &str = "unitigs.bin";
const EVIDENCE_FILE: &str = "evidence.bin";
const COUNTS_FILE: &str = "counts.bin";
/// Concrete `PtrHash` instantiation used as the layer's MPHF over `u64` keys.
/// NOTE(review): presumably spelled out in full (instead of `DefaultPtrHash`) to pin
/// every type parameter of the structure stored in `MPHF_FILE` — confirm.
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
pub struct Layer {
mphf: DefaultPtrHash,
mphf: Mphf,
evidence: Evidence,
unitigs: UnitigFileReader,
counts: Counts,
@@ -30,19 +33,14 @@ pub struct Hit {
impl Layer {
pub fn open(path: &Path) -> OLMResult<Self> {
let mphf: DefaultPtrHash = DefaultPtrHash::load_full(&path.join(MPHF_FILE))
let mphf: Mphf = Mphf::load_full(&path.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
let counts = Counts::open(&path.join(COUNTS_FILE))?;
Ok(Self {
mphf,
evidence,
unitigs,
counts,
})
Ok(Self { mphf, evidence, unitigs, counts })
}
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
@@ -52,10 +50,7 @@ impl Layer {
.unitigs
.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer)
{
Some(Hit {
slot,
count: self.counts.get(slot),
})
Some(Hit { slot, count: self.counts.get(slot) })
} else {
None
}
@@ -74,28 +69,39 @@ impl Layer {
if n == 0 {
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
fs::File::create(out_dir.join(COUNTS_FILE))?;
let mphf: DefaultPtrHash = DefaultPtrHash::new(&[] as &[u64], PtrHashParams::default());
let mphf: Mphf = Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
mphf.store(&out_dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
return Ok(0);
}
// Build MPHF from a cloneable parallel iterator — no Vec<u64> allocation.
// First pass: build the MPHF from a cloneable parallel iterator.
// flat_map_iter: outer chunks in parallel, inner kmer sliding-window sequential.
let keys = (0..unitigs.len())
.into_par_iter()
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
let mphf: DefaultPtrHash =
DefaultPtrHash::new_from_par_iter(n, keys, PtrHashParams::default());
let mphf: Mphf = Mphf::new_from_par_iter(n, keys, PtrHashParams::<CubicEps>::default());
mphf.store(&out_dir.join(MPHF_FILE))
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
// Second pass: fill evidence and counts
let mut ev = EvidenceWriter::new(n);
let mut cnt = CountsWriter::new(n);
// Second pass: fill evidence and counts; verify MPHF correctness inline.
// `seen` is a compact bitset (one bit per slot, ceil(n/8) bytes) — no extra iteration needed.
let mut ev = EvidenceWriter::new(n);
let mut cnt = CountsWriter::new(n);
let mut seen = vec![0u8; (n + 7) / 8];
for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() {
let slot = mphf.index(&kmer.raw());
if slot >= n {
return Err(OLMError::Mphf("MPHF construction failed: slot out of bounds".into()));
}
let byte = slot / 8;
let bit = 1u8 << (slot % 8);
if seen[byte] & bit != 0 {
return Err(OLMError::Mphf("MPHF construction failed: duplicate slot".into()));
}
seen[byte] |= bit;
ev.set(slot, chunk_id as u32, rank as u8);
cnt.set(slot, count_of(kmer));
}