Add persistent compact integer vector and cache-line-optimized MPHF

Introduce the `obicompactvec` crate, featuring a two-tier, memory-mapped integer vector that uses a primary `u8` array with a sentinel for overflow dispatch and a sparse L1-resident index for fast random access. Implement builder and reader modules with zero-copy serialization and comprehensive test coverage. Update `obilayeredmap` to replace the default hash function with a cache-line-optimized `Mphf`, adding explicit bounds checking and duplicate-slot detection. Add documentation for both modules and update project configuration files accordingly.
This commit is contained in:
Eric Coissac
2026-05-13 06:24:43 +08:00
parent 84ed752b78
commit f2de79acde
14 changed files with 710 additions and 91 deletions
+10
View File
@@ -0,0 +1,10 @@
[package]
name = "obicompactvec"
version = "0.1.0"
edition = "2024"
[dependencies]
memmap2 = "0.9"
[dev-dependencies]
tempfile = "3"
+64
View File
@@ -0,0 +1,64 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufWriter, Write as _};
use std::path::Path;
use crate::format::write_overflow;
/// In-memory builder for a two-tier compact integer vector.
///
/// Values below 255 are stored directly in `primary`. For any larger value the
/// slot's byte is set to the sentinel 255 and the real value is kept in
/// `overflow`.
pub struct PersistentCompactIntVecBuilder {
    /// One byte per slot: either the value itself or the sentinel 255.
    primary: Vec<u8>,
    /// Full values of the slots whose primary byte is the sentinel, keyed by slot.
    overflow: HashMap<u64, u32>,
}
impl PersistentCompactIntVecBuilder {
pub fn new(n: usize) -> Self {
Self {
primary: vec![0u8; n],
overflow: HashMap::new(),
}
}
pub fn set(&mut self, slot: u64, value: u32) {
if value < 255 {
self.primary[slot as usize] = value as u8;
self.overflow.remove(&slot);
} else {
self.primary[slot as usize] = 255;
self.overflow.insert(slot, value);
}
}
pub fn get(&self, slot: u64) -> u32 {
match self.primary[slot as usize] {
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
v => v as u32,
}
}
pub fn len(&self) -> usize {
self.primary.len()
}
/// Write `counts_primary.bin` and (if needed) `counts_overflow.bin`, then drop all state.
pub fn close(self, primary_path: &Path, overflow_path: &Path) -> io::Result<()> {
// Write primary array.
let mut w = BufWriter::new(File::create(primary_path)?);
w.write_all(&self.primary)?;
w.flush()?;
drop(w);
if self.overflow.is_empty() {
return Ok(());
}
// Sort overflow entries by slot.
let mut entries: Vec<(u32, u32)> = self
.overflow
.into_iter()
.map(|(slot, value)| (slot as u32, value))
.collect();
entries.sort_unstable_by_key(|&(slot, _)| slot);
write_overflow(overflow_path, &entries)
}
}
+81
View File
@@ -0,0 +1,81 @@
use std::fs::File;
use std::io::{self, BufWriter, Write as _};
use std::path::Path;
/// Magic bytes written at the very start of an overflow file.
pub const MAGIC: [u8; 4] = *b"PCIV";
/// Upper bound on the number of sparse-index entries.
///
/// L1 cache target: 32 KB / 8 bytes per index entry.
pub const L1_INDEX_ENTRIES: usize = 4096;
/// Write the overflow file from a sorted list of (slot, value) pairs.
pub fn write_overflow(
path: &Path,
entries: &[(u32, u32)],
) -> io::Result<()> {
let n_overflow = entries.len() as u32;
let step: u32 = if entries.len() <= L1_INDEX_ENTRIES {
0
} else {
entries.len().div_ceil(L1_INDEX_ENTRIES) as u32
};
let mut w = BufWriter::new(File::create(path)?);
w.write_all(&MAGIC)?;
w.write_all(&n_overflow.to_le_bytes())?;
w.write_all(&step.to_le_bytes())?;
if step > 0 {
let n_index = entries.len().div_ceil(step as usize) as u32;
w.write_all(&n_index.to_le_bytes())?;
for (pos, chunk) in entries.chunks(step as usize).enumerate() {
let slot = chunk[0].0;
w.write_all(&slot.to_le_bytes())?;
w.write_all(&((pos * step as usize) as u32).to_le_bytes())?;
}
}
for &(slot, value) in entries {
w.write_all(&slot.to_le_bytes())?;
w.write_all(&value.to_le_bytes())?;
}
w.flush()
}
/// Parse the header and sparse index from the overflow file bytes.
/// Returns (n_overflow, step, index_entries, data_byte_offset).
///
/// The sparse index is only present (and only read) when `step > 0`.
/// `data_byte_offset` is the offset in `data` of the first (slot, value) record.
///
/// # Errors
/// * `InvalidData` if the file does not start with the PCIV magic.
/// * `UnexpectedEof` if `data` is too short for the header, for the announced
///   number of index entries, or for the announced number of data records.
pub fn parse_overflow_header(
    data: &[u8],
) -> io::Result<(u32, u32, Vec<(u32, u32)>, usize)> {
    let truncated = || io::Error::new(io::ErrorKind::UnexpectedEof, "truncated PCIV file");

    let mut pos = 0;
    let magic = read4(data, &mut pos)?;
    if magic != MAGIC {
        return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
    }
    let n_overflow = u32::from_le_bytes(read4(data, &mut pos)?);
    let step = u32::from_le_bytes(read4(data, &mut pos)?);

    let mut index = Vec::new();
    if step > 0 {
        let n_index = u32::from_le_bytes(read4(data, &mut pos)?) as usize;
        // The count comes from the file: check it against the bytes that are
        // actually there before allocating, so a corrupt header cannot request
        // a multi-gigabyte reservation.
        if n_index > (data.len() - pos) / 8 {
            return Err(truncated());
        }
        index.reserve(n_index);
        for _ in 0..n_index {
            let slot = u32::from_le_bytes(read4(data, &mut pos)?);
            let ipos = u32::from_le_bytes(read4(data, &mut pos)?);
            index.push((slot, ipos));
        }
    }

    // Every announced 8-byte record must be present, otherwise lookups would
    // later run off the end of the data region.
    if (n_overflow as usize) > (data.len() - pos) / 8 {
        return Err(truncated());
    }
    Ok((n_overflow, step, index, pos))
}
/// Read the 4 bytes of `data` starting at `*pos` and advance `*pos` past them.
///
/// On failure (fewer than 4 bytes left) `*pos` is left untouched and an
/// `UnexpectedEof` error is returned.
fn read4(data: &[u8], pos: &mut usize) -> io::Result<[u8; 4]> {
    let start = *pos;
    match data.get(start..start + 4) {
        Some(window) => {
            let mut word = [0u8; 4];
            word.copy_from_slice(window);
            *pos = start + 4;
            Ok(word)
        }
        None => Err(io::Error::new(io::ErrorKind::UnexpectedEof, "truncated PCIV file")),
    }
}
+10
View File
@@ -0,0 +1,10 @@
mod builder;
mod format;
mod reader;
pub use builder::PersistentCompactIntVecBuilder;
pub use reader::PersistentCompactIntVec;
#[cfg(test)]
#[path = "tests/mod.rs"]
mod tests;
+93
View File
@@ -0,0 +1,93 @@
use std::fs::File;
use std::io;
use std::path::Path;
use memmap2::Mmap;
use crate::format::parse_overflow_header;
/// Read-only view of a compact integer vector backed by memory-mapped files.
///
/// A primary byte of 255 is a sentinel: the real value is then looked up in
/// the overflow file.
pub struct PersistentCompactIntVec {
    /// Memory map of the primary file, one byte per slot.
    primary: Mmap,
    /// (slot, pos) — L1-resident sparse index; empty when no index was parsed.
    index: Vec<(u32, u32)>,
    /// Memory map of the whole overflow file (`None` when no overflow path was
    /// given); the data region starts at `data_offset`.
    data: Option<Mmap>,
    /// Number of overflow records announced by the overflow file header.
    n_overflow: u32,
    /// Sparse-index step read from the overflow header; 0 means no sparse index.
    pub step: u32,
    /// Byte offset of the data region within the overflow mmap.
    data_offset: usize,
}
impl PersistentCompactIntVec {
/// Open a previously written `PersistentCompactIntVec`.
///
/// `overflow_path` is optional: pass `None` if no overflow file was written
/// (i.e. all values were < 255).
pub fn open(primary_path: &Path, overflow_path: Option<&Path>) -> io::Result<Self> {
let primary = unsafe { Mmap::map(&File::open(primary_path)?)? };
let (data, n_overflow, step, index, data_offset) = match overflow_path {
None => (None, 0, 0, Vec::new(), 0),
Some(p) => {
let mmap = unsafe { Mmap::map(&File::open(p)?)? };
let (n_overflow, step, index, data_offset) =
parse_overflow_header(&mmap)?;
(Some(mmap), n_overflow, step, index, data_offset)
}
};
Ok(Self { primary, index, data, n_overflow, step, data_offset })
}
pub fn len(&self) -> usize {
self.primary.len()
}
pub fn is_empty(&self) -> bool {
self.primary.is_empty()
}
pub fn get(&self, slot: u64) -> u32 {
match self.primary[slot as usize] {
255 => self.overflow_get(slot as u32),
v => v as u32,
}
}
fn overflow_get(&self, slot: u32) -> u32 {
let data = self.data.as_ref().expect("sentinel without overflow file");
let raw = &data[self.data_offset..];
let pos_start;
let pos_end;
if self.step == 0 {
pos_start = 0;
pos_end = self.n_overflow as usize;
} else {
// Binary search in the L1-resident sparse index.
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
pos_start = self.index[i].1 as usize;
pos_end = if i + 1 < self.index.len() {
self.index[i + 1].1 as usize
} else {
self.n_overflow as usize
};
}
// Binary search within the block.
let block = Self::data_slice(raw, pos_start, pos_end);
match block.binary_search_by_key(&slot, |&(s, _)| s) {
Ok(i) => block[i].1,
Err(_) => panic!("slot {slot} marked as overflow but not found in data"),
}
}
/// Interpret a byte slice as a slice of (u32, u32) pairs without copying.
fn data_slice(raw: &[u8], from: usize, to: usize) -> &[(u32, u32)] {
let byte_start = from * 8;
let byte_end = to * 8;
let bytes = &raw[byte_start..byte_end];
// Safety: the file was written as LE u32 pairs; alignment is guaranteed
// because we read from a byte slice and cast explicitly.
let ptr = bytes.as_ptr() as *const (u32, u32);
unsafe { std::slice::from_raw_parts(ptr, to - from) }
}
}
+119
View File
@@ -0,0 +1,119 @@
use tempfile::tempdir;
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
/// Location of the primary file inside the test directory `dir`.
fn primary(dir: &std::path::Path) -> std::path::PathBuf {
    let mut path = dir.to_path_buf();
    path.push("primary.bin");
    path
}

/// Location of the overflow file inside the test directory `dir`.
fn overflow(dir: &std::path::Path) -> std::path::PathBuf {
    let mut path = dir.to_path_buf();
    path.push("overflow.bin");
    path
}
/// Build an `n`-slot vector holding `values`, write it to a temporary
/// directory, reopen it, and return the value read back from every slot.
///
/// The overflow file is passed to the reader only if the builder created it.
fn roundtrip(values: &[(u64, u32)], n: usize) -> Vec<u32> {
    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(n);
    values.iter().for_each(|&(slot, v)| builder.set(slot, v));
    builder.close(&primary_file, &overflow_file).unwrap();

    let overflow_arg = if overflow_file.exists() {
        Some(overflow_file.as_path())
    } else {
        None
    };
    let reader = PersistentCompactIntVec::open(&primary_file, overflow_arg).unwrap();

    let mut read_back = Vec::with_capacity(n);
    for slot in 0..n as u64 {
        read_back.push(reader.get(slot));
    }
    read_back
}
/// Slots that were never set read back as 0.
#[test]
fn all_zero_by_default() {
    let read_back = roundtrip(&[], 8);
    assert!(!read_back.iter().any(|&v| v != 0));
}
/// Values below 255 stay in the primary file and no overflow file is created.
#[test]
fn small_values_no_overflow_file() {
    let expected: [u32; 4] = [1, 254, 0, 100];

    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(expected.len());
    for (slot, &value) in expected.iter().enumerate() {
        builder.set(slot as u64, value);
    }
    builder.close(&primary_file, &overflow_file).unwrap();
    assert!(!overflow_file.exists(), "no overflow file expected");

    let reader = PersistentCompactIntVec::open(&primary_file, None).unwrap();
    for (slot, &value) in expected.iter().enumerate() {
        assert_eq!(reader.get(slot as u64), value);
    }
}
/// Values at or above the 255 sentinel survive a write/read cycle, next to a
/// small value stored inline.
#[test]
fn overflow_values_roundtrip() {
    let expected: [u32; 4] = [255, 1000, 50, 1_313_691];
    let inputs: Vec<(u64, u32)> = expected
        .iter()
        .enumerate()
        .map(|(slot, &value)| (slot as u64, value))
        .collect();

    let read_back = roundtrip(&inputs, expected.len());
    for (slot, &value) in expected.iter().enumerate() {
        assert_eq!(read_back[slot], value);
    }
}
/// Overwriting an overflow value with a small one drops the overflow entry,
/// so no overflow file is written at all.
#[test]
fn mutation_downward_removes_from_overflow() {
    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(2);
    // First an overflow value, then one that is back below the threshold.
    for value in [1000, 42] {
        builder.set(0, value);
    }
    builder.close(&primary_file, &overflow_file).unwrap();
    assert!(!overflow_file.exists(), "no overflow file expected after downward mutation");

    let reader = PersistentCompactIntVec::open(&primary_file, None).unwrap();
    assert_eq!(reader.get(0), 42);
}
/// Overwriting one overflow value with another keeps only the latest value.
#[test]
fn mutation_upward_updates_overflow() {
    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(1);
    for value in [300, 500] {
        builder.set(0, value);
    }
    builder.close(&primary_file, &overflow_file).unwrap();

    let reader =
        PersistentCompactIntVec::open(&primary_file, Some(overflow_file.as_path())).unwrap();
    assert_eq!(reader.get(0), 500);
}
/// With more overflow entries than L1_INDEX_ENTRIES (4096) the sparse index is
/// written, and every slot is still found through it.
#[test]
fn sparse_index_built_for_many_overflows() {
    const SLOTS: usize = 5000;
    // Every value is ≥ 255, so every slot lands in the overflow file.
    let value_at = |slot: usize| 1000 + slot as u32;

    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(SLOTS);
    (0..SLOTS).for_each(|slot| builder.set(slot as u64, value_at(slot)));
    builder.close(&primary_file, &overflow_file).unwrap();
    assert!(overflow_file.exists());

    let reader =
        PersistentCompactIntVec::open(&primary_file, Some(overflow_file.as_path())).unwrap();
    assert!(reader.step > 0, "sparse index should have been built");
    for i in 0..SLOTS {
        assert_eq!(reader.get(i as u64), value_at(i), "mismatch at slot {i}");
    }
}
/// Mostly small values with an overflow value on every 100th slot.
#[test]
fn mixed_large_dataset() {
    const SLOTS: usize = 1000;

    /// Value stored at slot `i`: large on multiples of 100, small otherwise.
    fn expected_at(i: usize) -> u32 {
        if i % 100 == 0 {
            100_000 + i as u32
        } else {
            i as u32 % 200
        }
    }

    let tmp = tempdir().unwrap();
    let primary_file = primary(tmp.path());
    let overflow_file = overflow(tmp.path());

    let mut builder = PersistentCompactIntVecBuilder::new(SLOTS);
    (0..SLOTS).for_each(|slot| builder.set(slot as u64, expected_at(slot)));
    builder.close(&primary_file, &overflow_file).unwrap();

    let overflow_arg = if overflow_file.exists() {
        Some(overflow_file.as_path())
    } else {
        None
    };
    let reader = PersistentCompactIntVec::open(&primary_file, overflow_arg).unwrap();
    for i in 0..SLOTS {
        assert_eq!(reader.get(i as u64), expected_at(i), "slot {i}");
    }
}