Add persistent compact integer vector and cache-line-optimized MPHF
Introduce the `obicompactvec` crate, featuring a two-tier, memory-mapped integer vector that uses a primary `u8` array with a sentinel for overflow dispatch and a sparse L1-resident index for fast random access. Implement builder and reader modules with zero-copy serialization and comprehensive test coverage. Update `obilayeredmap` to replace the default hash function with a cache-line-optimized `Mphf`, adding explicit bounds checking and duplicate-slot detection. Add documentation for both modules and update project configuration files accordingly.
This commit is contained in:
@@ -0,0 +1,10 @@
|
||||
[package]
|
||||
name = "obicompactvec"
|
||||
version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
memmap2 = "0.9"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
@@ -0,0 +1,64 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufWriter, Write as _};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::format::write_overflow;
|
||||
|
||||
pub struct PersistentCompactIntVecBuilder {
|
||||
primary: Vec<u8>,
|
||||
overflow: HashMap<u64, u32>,
|
||||
}
|
||||
|
||||
impl PersistentCompactIntVecBuilder {
|
||||
pub fn new(n: usize) -> Self {
|
||||
Self {
|
||||
primary: vec![0u8; n],
|
||||
overflow: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set(&mut self, slot: u64, value: u32) {
|
||||
if value < 255 {
|
||||
self.primary[slot as usize] = value as u8;
|
||||
self.overflow.remove(&slot);
|
||||
} else {
|
||||
self.primary[slot as usize] = 255;
|
||||
self.overflow.insert(slot, value);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, slot: u64) -> u32 {
|
||||
match self.primary[slot as usize] {
|
||||
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.primary.len()
|
||||
}
|
||||
|
||||
/// Write `counts_primary.bin` and (if needed) `counts_overflow.bin`, then drop all state.
|
||||
pub fn close(self, primary_path: &Path, overflow_path: &Path) -> io::Result<()> {
|
||||
// Write primary array.
|
||||
let mut w = BufWriter::new(File::create(primary_path)?);
|
||||
w.write_all(&self.primary)?;
|
||||
w.flush()?;
|
||||
drop(w);
|
||||
|
||||
if self.overflow.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Sort overflow entries by slot.
|
||||
let mut entries: Vec<(u32, u32)> = self
|
||||
.overflow
|
||||
.into_iter()
|
||||
.map(|(slot, value)| (slot as u32, value))
|
||||
.collect();
|
||||
entries.sort_unstable_by_key(|&(slot, _)| slot);
|
||||
|
||||
write_overflow(overflow_path, &entries)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,81 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufWriter, Write as _};
|
||||
use std::path::Path;
|
||||
|
||||
/// Magic bytes at the start of every overflow file.
pub const MAGIC: [u8; 4] = *b"PCIV";

/// Maximum number of entries without a sparse index, and the target size of
/// the sparse index itself.
///
/// L1 cache target: 32 KB / 8 bytes per index entry.
pub const L1_INDEX_ENTRIES: usize = 4096;

/// Write the overflow file from a sorted list of (slot, value) pairs.
///
/// Layout (all integers little-endian `u32`):
///
/// 1. `MAGIC`, `n_overflow`, `step`
/// 2. only when `step > 0`: `n_index`, then `n_index` pairs of
///    `(first slot of block, record position of block)`
/// 3. `n_overflow` records of `(slot, value)`
///
/// `step` is `0` (no sparse index) when there are at most
/// [`L1_INDEX_ENTRIES`] entries.
///
/// # Errors
///
/// Returns `InvalidInput`, without creating the file, if `entries` has more
/// than `u32::MAX` elements or is not sorted by strictly increasing slot.
/// Otherwise returns any I/O error raised while creating or writing the file.
pub fn write_overflow(path: &Path, entries: &[(u32, u32)]) -> io::Result<()> {
    let n_overflow = u32::try_from(entries.len()).map_err(|_| {
        io::Error::new(
            io::ErrorKind::InvalidInput,
            "too many overflow entries for the PCIV format",
        )
    })?;

    // Lookups binary-search the records, so an unsorted table would produce
    // a file whose entries cannot be found again.
    if entries.windows(2).any(|pair| pair[0].0 >= pair[1].0) {
        return Err(io::Error::new(
            io::ErrorKind::InvalidInput,
            "overflow entries must be sorted by strictly increasing slot",
        ));
    }

    // Records per sparse-index block; 0 disables the index.
    let step: usize = if entries.len() <= L1_INDEX_ENTRIES {
        0
    } else {
        entries.len().div_ceil(L1_INDEX_ENTRIES)
    };

    let mut w = BufWriter::new(File::create(path)?);

    w.write_all(&MAGIC)?;
    w.write_all(&n_overflow.to_le_bytes())?;
    // `step` is at most `n_overflow`, so it fits in a `u32`.
    w.write_all(&(step as u32).to_le_bytes())?;

    if step > 0 {
        // At most `L1_INDEX_ENTRIES` blocks by construction of `step`.
        let n_index = entries.len().div_ceil(step) as u32;
        w.write_all(&n_index.to_le_bytes())?;
        for (block, chunk) in entries.chunks(step).enumerate() {
            let first_slot = chunk[0].0;
            // Position of the block's first record; below `n_overflow`.
            let first_pos = (block * step) as u32;
            w.write_all(&first_slot.to_le_bytes())?;
            w.write_all(&first_pos.to_le_bytes())?;
        }
    }

    for &(slot, value) in entries {
        w.write_all(&slot.to_le_bytes())?;
        w.write_all(&value.to_le_bytes())?;
    }

    w.flush()
}
|
||||
|
||||
/// Parse the header and sparse index from the overflow file bytes.
|
||||
/// Returns (n_overflow, step, index_entries, data_byte_offset).
|
||||
pub fn parse_overflow_header(
|
||||
data: &[u8],
|
||||
) -> io::Result<(u32, u32, Vec<(u32, u32)>, usize)> {
|
||||
let mut pos = 0;
|
||||
|
||||
let magic = read4(data, &mut pos)?;
|
||||
if magic != MAGIC {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
||||
}
|
||||
|
||||
let n_overflow = u32::from_le_bytes(read4(data, &mut pos)?);
|
||||
let step = u32::from_le_bytes(read4(data, &mut pos)?);
|
||||
|
||||
let mut index = Vec::new();
|
||||
if step > 0 {
|
||||
let n_index = u32::from_le_bytes(read4(data, &mut pos)?) as usize;
|
||||
index.reserve(n_index);
|
||||
for _ in 0..n_index {
|
||||
let slot = u32::from_le_bytes(read4(data, &mut pos)?);
|
||||
let ipos = u32::from_le_bytes(read4(data, &mut pos)?);
|
||||
index.push((slot, ipos));
|
||||
}
|
||||
}
|
||||
|
||||
Ok((n_overflow, step, index, pos))
|
||||
}
|
||||
|
||||
/// Read the four bytes starting at `*pos` and advance `*pos` past them.
///
/// On failure `*pos` is left unchanged and an `UnexpectedEof` error is returned.
fn read4(data: &[u8], pos: &mut usize) -> io::Result<[u8; 4]> {
    let start = *pos;
    let word: Option<[u8; 4]> = match data.get(start..start + 4) {
        Some(window) => window.try_into().ok(),
        None => None,
    };
    match word {
        Some(bytes) => {
            *pos += 4;
            Ok(bytes)
        }
        None => Err(io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "truncated PCIV file",
        )),
    }
}
|
||||
@@ -0,0 +1,10 @@
|
||||
mod builder;
|
||||
mod format;
|
||||
mod reader;
|
||||
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/mod.rs"]
|
||||
mod tests;
|
||||
@@ -0,0 +1,93 @@
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::path::Path;
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
use crate::format::parse_overflow_header;
|
||||
|
||||
pub struct PersistentCompactIntVec {
|
||||
primary: Mmap,
|
||||
index: Vec<(u32, u32)>, // (slot, pos) — L1-resident sparse index
|
||||
data: Option<Mmap>, // mmap of overflow data region
|
||||
n_overflow: u32,
|
||||
pub step: u32,
|
||||
data_offset: usize, // byte offset of data region within overflow mmap
|
||||
}
|
||||
|
||||
impl PersistentCompactIntVec {
|
||||
/// Open a previously written `PersistentCompactIntVec`.
|
||||
///
|
||||
/// `overflow_path` is optional: pass `None` if no overflow file was written
|
||||
/// (i.e. all values were < 255).
|
||||
pub fn open(primary_path: &Path, overflow_path: Option<&Path>) -> io::Result<Self> {
|
||||
let primary = unsafe { Mmap::map(&File::open(primary_path)?)? };
|
||||
|
||||
let (data, n_overflow, step, index, data_offset) = match overflow_path {
|
||||
None => (None, 0, 0, Vec::new(), 0),
|
||||
Some(p) => {
|
||||
let mmap = unsafe { Mmap::map(&File::open(p)?)? };
|
||||
let (n_overflow, step, index, data_offset) =
|
||||
parse_overflow_header(&mmap)?;
|
||||
(Some(mmap), n_overflow, step, index, data_offset)
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self { primary, index, data, n_overflow, step, data_offset })
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.primary.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.primary.is_empty()
|
||||
}
|
||||
|
||||
pub fn get(&self, slot: u64) -> u32 {
|
||||
match self.primary[slot as usize] {
|
||||
255 => self.overflow_get(slot as u32),
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
|
||||
fn overflow_get(&self, slot: u32) -> u32 {
|
||||
let data = self.data.as_ref().expect("sentinel without overflow file");
|
||||
let raw = &data[self.data_offset..];
|
||||
|
||||
let pos_start;
|
||||
let pos_end;
|
||||
|
||||
if self.step == 0 {
|
||||
pos_start = 0;
|
||||
pos_end = self.n_overflow as usize;
|
||||
} else {
|
||||
// Binary search in the L1-resident sparse index.
|
||||
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||
pos_start = self.index[i].1 as usize;
|
||||
pos_end = if i + 1 < self.index.len() {
|
||||
self.index[i + 1].1 as usize
|
||||
} else {
|
||||
self.n_overflow as usize
|
||||
};
|
||||
}
|
||||
|
||||
// Binary search within the block.
|
||||
let block = Self::data_slice(raw, pos_start, pos_end);
|
||||
match block.binary_search_by_key(&slot, |&(s, _)| s) {
|
||||
Ok(i) => block[i].1,
|
||||
Err(_) => panic!("slot {slot} marked as overflow but not found in data"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Interpret a byte slice as a slice of (u32, u32) pairs without copying.
|
||||
fn data_slice(raw: &[u8], from: usize, to: usize) -> &[(u32, u32)] {
|
||||
let byte_start = from * 8;
|
||||
let byte_end = to * 8;
|
||||
let bytes = &raw[byte_start..byte_end];
|
||||
// Safety: the file was written as LE u32 pairs; alignment is guaranteed
|
||||
// because we read from a byte slice and cast explicitly.
|
||||
let ptr = bytes.as_ptr() as *const (u32, u32);
|
||||
unsafe { std::slice::from_raw_parts(ptr, to - from) }
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,119 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||
|
||||
/// Path of the primary array file inside `dir`.
fn primary(dir: &std::path::Path) -> std::path::PathBuf {
    let mut path = dir.to_path_buf();
    path.push("primary.bin");
    path
}
|
||||
/// Path of the overflow file inside `dir`.
fn overflow(dir: &std::path::Path) -> std::path::PathBuf {
    let mut path = dir.to_path_buf();
    path.push("overflow.bin");
    path
}
|
||||
|
||||
fn roundtrip(values: &[(u64, u32)], n: usize) -> Vec<u32> {
|
||||
let dir = tempdir().unwrap();
|
||||
let mut b = PersistentCompactIntVecBuilder::new(n);
|
||||
for &(slot, v) in values {
|
||||
b.set(slot, v);
|
||||
}
|
||||
let ov = overflow(dir.path());
|
||||
b.close(&primary(dir.path()), &ov).unwrap();
|
||||
|
||||
let ov_path = ov.exists().then_some(ov.as_path());
|
||||
let r = PersistentCompactIntVec::open(&primary(dir.path()), ov_path).unwrap();
|
||||
(0..n as u64).map(|s| r.get(s)).collect()
|
||||
}
|
||||
|
||||
/// Slots that were never set read back as zero.
#[test]
fn all_zero_by_default() {
    let read_back = roundtrip(&[], 8);
    assert!(!read_back.iter().any(|&v| v != 0));
}
|
||||
|
||||
/// Values below the sentinel are stored in the primary file only.
#[test]
fn small_values_no_overflow_file() {
    let expected: [(u64, u32); 4] = [(0, 1), (1, 254), (2, 0), (3, 100)];

    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(4);
    for &(slot, value) in &expected {
        builder.set(slot, value);
    }
    builder.close(&primary_file, &overflow_file).unwrap();
    assert!(!overflow_file.exists(), "no overflow file expected");

    let reader = PersistentCompactIntVec::open(&primary_file, None).unwrap();
    for &(slot, value) in &expected {
        assert_eq!(reader.get(slot), value);
    }
}
|
||||
|
||||
/// Values at or above the sentinel survive a write/read cycle, mixed with a small one.
#[test]
fn overflow_values_roundtrip() {
    let values: [(u64, u32); 4] = [(0, 255), (1, 1000), (2, 50), (3, 1_313_691)];
    let read_back = roundtrip(&values, 4);
    for &(slot, want) in &values {
        assert_eq!(read_back[slot as usize], want);
    }
}
|
||||
|
||||
/// Overwriting an overflow value with a small one drops the overflow entry.
#[test]
fn mutation_downward_removes_from_overflow() {
    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(2);
    // First write goes to overflow, the second brings the slot back below the threshold.
    for value in [1000u32, 42] {
        builder.set(0, value);
    }
    builder.close(&primary_file, &overflow_file).unwrap();
    assert!(
        !overflow_file.exists(),
        "no overflow file expected after downward mutation"
    );

    let reader = PersistentCompactIntVec::open(&primary_file, None).unwrap();
    assert_eq!(reader.get(0), 42);
}
|
||||
|
||||
/// Overwriting one overflow value with another keeps only the latest.
#[test]
fn mutation_upward_updates_overflow() {
    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(1);
    for value in [300u32, 500] {
        builder.set(0, value);
    }
    builder.close(&primary_file, &overflow_file).unwrap();

    let reader =
        PersistentCompactIntVec::open(&primary_file, Some(overflow_file.as_path())).unwrap();
    assert_eq!(reader.get(0), 500);
}
|
||||
|
||||
/// More than 4096 overflow entries trigger the sparse index, and every slot still resolves.
#[test]
fn sparse_index_built_for_many_overflows() {
    // 5000 > L1_INDEX_ENTRIES (4096), so the writer must emit a sparse index.
    let n = 5000usize;
    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(n);
    // Every value is >= 255, so every slot overflows.
    (0..n).for_each(|i| builder.set(i as u64, 1000 + i as u32));
    builder.close(&primary_file, &overflow_file).unwrap();
    assert!(overflow_file.exists());

    let reader =
        PersistentCompactIntVec::open(&primary_file, Some(overflow_file.as_path())).unwrap();
    assert!(reader.step > 0, "sparse index should have been built");
    for i in 0..n {
        assert_eq!(reader.get(i as u64), 1000 + i as u32, "mismatch at slot {i}");
    }
}
|
||||
|
||||
/// Mostly small values with an overflow every 100 slots read back correctly.
#[test]
fn mixed_large_dataset() {
    // Mirrors a realistic distribution: most values small, sparse overflows.
    let n = 1000usize;
    let value_at = |i: usize| -> u32 {
        if i % 100 == 0 {
            100_000 + i as u32
        } else {
            i as u32 % 200
        }
    };

    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(n);
    for i in 0..n {
        builder.set(i as u64, value_at(i));
    }
    builder.close(&primary_file, &overflow_file).unwrap();

    let overflow_arg = overflow_file.exists().then_some(overflow_file.as_path());
    let reader = PersistentCompactIntVec::open(&primary_file, overflow_arg).unwrap();
    for i in 0..n {
        assert_eq!(reader.get(i as u64), value_at(i), "slot {i}");
    }
}
|
||||
Reference in New Issue
Block a user