Add persistent compact integer vector and cache-line-optimized MPHF
Introduce the `obicompactvec` crate, featuring a two-tier, memory-mapped integer vector that uses a primary `u8` array with a sentinel for overflow dispatch and a sparse L1-resident index for fast random access. Implement builder and reader modules with zero-copy serialization and comprehensive test coverage. Update `obilayeredmap` to replace `DefaultPtrHash` with an explicit, cache-line-optimized `Mphf` type alias (`PtrHash` parameterized with `CubicEps`, `CachelineEfVec`, and `Xx64`), and add explicit slot bounds checking and duplicate-slot detection while filling evidence and counts. Add documentation for both modules and update project configuration files accordingly.
This commit is contained in:
@@ -4,14 +4,15 @@ version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
obikseq = { path = "../obikseq" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
ptr_hash = "1.1"
|
||||
epserde = "0.8"
|
||||
rayon = "1"
|
||||
memmap2 = "0.9"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
obikseq = { path = "../obikseq" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
ptr_hash = "1.1"
|
||||
cacheline-ef = "1.1"
|
||||
epserde = "0.8"
|
||||
rayon = "1"
|
||||
memmap2 = "0.9"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
@@ -2,10 +2,11 @@ use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
use cacheline_ef::{CachelineEf, CachelineEfVec};
|
||||
use epserde::prelude::*;
|
||||
use obikseq::CanonicalKmer;
|
||||
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||
use ptr_hash::{DefaultPtrHash, PtrHashParams};
|
||||
use ptr_hash::{PtrHash, PtrHashParams, bucket_fn::CubicEps, hash::Xx64};
|
||||
|
||||
use crate::counts::{Counts, CountsWriter};
|
||||
use crate::error::{OLMError, OLMResult};
|
||||
@@ -16,8 +17,10 @@ const UNITIGS_FILE: &str = "unitigs.bin";
|
||||
const EVIDENCE_FILE: &str = "evidence.bin";
|
||||
const COUNTS_FILE: &str = "counts.bin";
|
||||
|
||||
type Mphf = PtrHash<u64, CubicEps, CachelineEfVec<Vec<CachelineEf>>, Xx64, Vec<u8>>;
|
||||
|
||||
pub struct Layer {
|
||||
mphf: DefaultPtrHash,
|
||||
mphf: Mphf,
|
||||
evidence: Evidence,
|
||||
unitigs: UnitigFileReader,
|
||||
counts: Counts,
|
||||
@@ -30,19 +33,14 @@ pub struct Hit {
|
||||
|
||||
impl Layer {
|
||||
pub fn open(path: &Path) -> OLMResult<Self> {
|
||||
let mphf: DefaultPtrHash = DefaultPtrHash::load_full(&path.join(MPHF_FILE))
|
||||
let mphf: Mphf = Mphf::load_full(&path.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
|
||||
let unitigs = UnitigFileReader::open(&path.join(UNITIGS_FILE))?;
|
||||
let evidence = Evidence::open(&path.join(EVIDENCE_FILE))?;
|
||||
let counts = Counts::open(&path.join(COUNTS_FILE))?;
|
||||
|
||||
Ok(Self {
|
||||
mphf,
|
||||
evidence,
|
||||
unitigs,
|
||||
counts,
|
||||
})
|
||||
Ok(Self { mphf, evidence, unitigs, counts })
|
||||
}
|
||||
|
||||
pub fn query(&self, kmer: CanonicalKmer) -> Option<Hit> {
|
||||
@@ -52,10 +50,7 @@ impl Layer {
|
||||
.unitigs
|
||||
.verify_canonical_kmer(chunk_id as usize, rank as usize, kmer)
|
||||
{
|
||||
Some(Hit {
|
||||
slot,
|
||||
count: self.counts.get(slot),
|
||||
})
|
||||
Some(Hit { slot, count: self.counts.get(slot) })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -74,28 +69,39 @@ impl Layer {
|
||||
if n == 0 {
|
||||
fs::File::create(out_dir.join(EVIDENCE_FILE))?;
|
||||
fs::File::create(out_dir.join(COUNTS_FILE))?;
|
||||
let mphf: DefaultPtrHash = DefaultPtrHash::new(&[] as &[u64], PtrHashParams::default());
|
||||
let mphf: Mphf = Mphf::try_new(&[] as &[u64], PtrHashParams::<CubicEps>::default())
|
||||
.ok_or_else(|| OLMError::Mphf("construction failed".into()))?;
|
||||
mphf.store(&out_dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
// Build MPHF from a cloneable parallel iterator — no Vec<u64> allocation.
|
||||
// First pass: build the MPHF from a cloneable parallel iterator.
|
||||
// flat_map_iter: outer chunks in parallel, inner kmer sliding-window sequential.
|
||||
let keys = (0..unitigs.len())
|
||||
.into_par_iter()
|
||||
.flat_map_iter(|ci| unitigs.unitig(ci).into_canonical_kmers().map(|km| km.raw()));
|
||||
let mphf: DefaultPtrHash =
|
||||
DefaultPtrHash::new_from_par_iter(n, keys, PtrHashParams::default());
|
||||
let mphf: Mphf = Mphf::new_from_par_iter(n, keys, PtrHashParams::<CubicEps>::default());
|
||||
mphf.store(&out_dir.join(MPHF_FILE))
|
||||
.map_err(|e| OLMError::InvalidLayer(e.to_string()))?;
|
||||
|
||||
// Second pass: fill evidence and counts
|
||||
let mut ev = EvidenceWriter::new(n);
|
||||
let mut cnt = CountsWriter::new(n);
|
||||
// Second pass: fill evidence and counts; verify MPHF correctness inline.
|
||||
// `seen` is a compact bitset (one bit per slot, (n + 7) / 8 bytes), so duplicate slots are detected in this same pass — no extra iteration needed.
|
||||
let mut ev = EvidenceWriter::new(n);
|
||||
let mut cnt = CountsWriter::new(n);
|
||||
let mut seen = vec![0u8; (n + 7) / 8];
|
||||
|
||||
for (kmer, chunk_id, rank) in unitigs.iter_indexed_canonical_kmers() {
|
||||
let slot = mphf.index(&kmer.raw());
|
||||
if slot >= n {
|
||||
return Err(OLMError::Mphf("MPHF construction failed: slot out of bounds".into()));
|
||||
}
|
||||
let byte = slot / 8;
|
||||
let bit = 1u8 << (slot % 8);
|
||||
if seen[byte] & bit != 0 {
|
||||
return Err(OLMError::Mphf("MPHF construction failed: duplicate slot".into()));
|
||||
}
|
||||
seen[byte] |= bit;
|
||||
ev.set(slot, chunk_id as u32, rank as u8);
|
||||
cnt.set(slot, count_of(kmer));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user