From 4d5fcd4340976c542de2a4205de6291d7398d760 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 13 May 2026 10:20:15 +0800 Subject: [PATCH] refactor: split obicompactvec storage into primary and overflow files Refactors the storage format to separate primary and overflow data into distinct files. Introduces a cache-friendly sparse index with dynamically computed step and entry counts. Consolidates dual memory-mapped regions into a single file with explicit header parsing and validation, replacing unsafe slice casting with direct byte-offset indexing. Updates the test suite to accommodate the new file structure. --- src/obicompactvec/src/builder.rs | 24 +++---- src/obicompactvec/src/format.rs | 105 ++++++++++++---------------- src/obicompactvec/src/reader.rs | 108 ++++++++++++++++------------- src/obicompactvec/src/tests/mod.rs | 73 +++++++++---------- 4 files changed, 145 insertions(+), 165 deletions(-) diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs index ed3a1a8..0e957b5 100644 --- a/src/obicompactvec/src/builder.rs +++ b/src/obicompactvec/src/builder.rs @@ -1,9 +1,8 @@ use std::collections::HashMap; -use std::fs::File; -use std::io::{self, BufWriter, Write as _}; +use std::io; use std::path::Path; -use crate::format::write_overflow; +use crate::format::write_pciv; pub struct PersistentCompactIntVecBuilder { primary: Vec, @@ -39,19 +38,12 @@ impl PersistentCompactIntVecBuilder { self.primary.len() } - /// Write `counts_primary.bin` and (if needed) `counts_overflow.bin`, then drop all state. - pub fn close(self, primary_path: &Path, overflow_path: &Path) -> io::Result<()> { - // Write primary array. - let mut w = BufWriter::new(File::create(primary_path)?); - w.write_all(&self.primary)?; - w.flush()?; - drop(w); + pub fn is_empty(&self) -> bool { + self.primary.is_empty() + } - if self.overflow.is_empty() { - return Ok(()); - } - - // Sort overflow entries by slot. + /// Finalise and write a single PCIV file, then drop all in-memory state. + pub fn close(self, path: &Path) -> io::Result<()> { let mut entries: Vec<(u32, u32)> = self .overflow .into_iter() @@ -59,6 +51,6 @@ impl PersistentCompactIntVecBuilder { .collect(); entries.sort_unstable_by_key(|&(slot, _)| slot); - write_overflow(overflow_path, &entries) + write_pciv(path, &self.primary, &entries) } } diff --git a/src/obicompactvec/src/format.rs b/src/obicompactvec/src/format.rs index ec64bcb..8cdcc36 100644 --- a/src/obicompactvec/src/format.rs +++ b/src/obicompactvec/src/format.rs @@ -1,81 +1,66 @@ use std::fs::File; -use std::io::{self, BufWriter, Write as _}; +use std::io::{self, BufWriter, Seek, SeekFrom, Write as _}; use std::path::Path; pub const MAGIC: [u8; 4] = *b"PCIV"; +pub const HEADER_SIZE: usize = 24; // magic(4) + n(8) + n_overflow(4) + step(4) + n_index(4) -// L1 cache target: 32 KB / 8 bytes per index entry +// Sparse index target: fits in 32 KB L1 cache / 8 bytes per entry. pub const L1_INDEX_ENTRIES: usize = 4096; -/// Write the overflow file from a sorted list of (slot, value) pairs. -pub fn write_overflow( - path: &Path, - entries: &[(u32, u32)], -) -> io::Result<()> { - let n_overflow = entries.len() as u32; - - let step: u32 = if entries.len() <= L1_INDEX_ENTRIES { - 0 - } else { - entries.len().div_ceil(L1_INDEX_ENTRIES) as u32 - }; +/// Write a single PCIV file. +/// +/// Layout (write order = computation order): +/// header 24 B placeholder, overwritten at the end +/// primary n B u8 per slot (255 = overflow sentinel) +/// data n_overflow × 8 B (slot: u32, value: u32) sorted by slot +/// index n_index × 8 B (slot: u32, pos: u32) sparse index +pub fn write_pciv(path: &Path, primary: &[u8], entries: &[(u32, u32)]) -> io::Result<()> { + let n = primary.len(); + let n_overflow = entries.len(); let mut w = BufWriter::new(File::create(path)?); - w.write_all(&MAGIC)?; - w.write_all(&n_overflow.to_le_bytes())?; - w.write_all(&step.to_le_bytes())?; + // Placeholder header — will be overwritten at the end. + w.write_all(&[0u8; HEADER_SIZE])?; - if step > 0 { - let n_index = entries.len().div_ceil(step as usize) as u32; - w.write_all(&n_index.to_le_bytes())?; - for (pos, chunk) in entries.chunks(step as usize).enumerate() { - let slot = chunk[0].0; - w.write_all(&slot.to_le_bytes())?; - w.write_all(&((pos * step as usize) as u32).to_le_bytes())?; - } - } + // Primary array. + w.write_all(primary)?; + // Overflow data (sorted by slot). for &(slot, value) in entries { w.write_all(&slot.to_le_bytes())?; w.write_all(&value.to_le_bytes())?; } - w.flush() -} + // Sparse index (computed now that n_overflow is known). + let step: u32 = if n_overflow <= L1_INDEX_ENTRIES { + 0 + } else { + n_overflow.div_ceil(L1_INDEX_ENTRIES) as u32 + }; -/// Parse the header and sparse index from the overflow file bytes. -/// Returns (n_overflow, step, index_entries, data_byte_offset). -pub fn parse_overflow_header( - data: &[u8], -) -> io::Result<(u32, u32, Vec<(u32, u32)>, usize)> { - let mut pos = 0; - - let magic = read4(data, &mut pos)?; - if magic != MAGIC { - return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic")); - } - - let n_overflow = u32::from_le_bytes(read4(data, &mut pos)?); - let step = u32::from_le_bytes(read4(data, &mut pos)?); - - let mut index = Vec::new(); - if step > 0 { - let n_index = u32::from_le_bytes(read4(data, &mut pos)?) as usize; - index.reserve(n_index); - for _ in 0..n_index { - let slot = u32::from_le_bytes(read4(data, &mut pos)?); - let ipos = u32::from_le_bytes(read4(data, &mut pos)?); - index.push((slot, ipos)); + let n_index: u32 = if step > 0 { + let count = n_overflow.div_ceil(step as usize) as u32; + for (block, chunk) in entries.chunks(step as usize).enumerate() { + let slot = chunk[0].0; + let pos = (block * step as usize) as u32; + w.write_all(&slot.to_le_bytes())?; + w.write_all(&pos.to_le_bytes())?; } - } + count + } else { + 0 + }; - Ok((n_overflow, step, index, pos)) -} - -fn read4(data: &[u8], pos: &mut usize) -> io::Result<[u8; 4]> { - data.get(*pos..*pos + 4) - .and_then(|s| s.try_into().ok()) - .map(|arr| { *pos += 4; arr }) - .ok_or_else(|| io::Error::new(io::ErrorKind::UnexpectedEof, "truncated PCIV file")) + // Flush, then seek back to write the real header. + w.flush()?; + let mut file = w.into_inner().map_err(|e| e.into_error())?; + file.seek(SeekFrom::Start(0))?; + file.write_all(&MAGIC)?; + file.write_all(&(n as u64).to_le_bytes())?; + file.write_all(&(n_overflow as u32).to_le_bytes())?; + file.write_all(&step.to_le_bytes())?; + file.write_all(&n_index.to_le_bytes())?; + file.flush() } diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index 02a6e94..3a8bd56 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -4,90 +4,104 @@ use std::path::Path; use memmap2::Mmap; -use crate::format::parse_overflow_header; +use crate::format::{HEADER_SIZE, MAGIC}; pub struct PersistentCompactIntVec { - primary: Mmap, - index: Vec<(u32, u32)>, // (slot, pos) — L1-resident sparse index - data: Option, // mmap of overflow data region - n_overflow: u32, - pub step: u32, - data_offset: usize, // byte offset of data region within overflow mmap + mmap: Mmap, + n: usize, + n_overflow: usize, + pub step: u32, + index: Vec<(u32, u32)>, // (slot, pos) — L1-resident sparse index + primary_offset: usize, // = HEADER_SIZE + data_offset: usize, // = HEADER_SIZE + n } impl PersistentCompactIntVec { - /// Open a previously written `PersistentCompactIntVec`. - /// - /// `overflow_path` is optional: pass `None` if no overflow file was written - /// (i.e. all values were < 255). - pub fn open(primary_path: &Path, overflow_path: Option<&Path>) -> io::Result { - let primary = unsafe { Mmap::map(&File::open(primary_path)?)? }; + pub fn open(path: &Path) -> io::Result { + let mmap = unsafe { Mmap::map(&File::open(path)?)? }; - let (data, n_overflow, step, index, data_offset) = match overflow_path { - None => (None, 0, 0, Vec::new(), 0), - Some(p) => { - let mmap = unsafe { Mmap::map(&File::open(p)?)? }; - let (n_overflow, step, index, data_offset) = - parse_overflow_header(&mmap)?; - (Some(mmap), n_overflow, step, index, data_offset) - } - }; + if mmap.len() < HEADER_SIZE { + return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short")); + } + if &mmap[0..4] != &MAGIC { + return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic")); + } - Ok(Self { primary, index, data, n_overflow, step, data_offset }) + let n = u64::from_le_bytes(mmap[4..12].try_into().unwrap()) as usize; + let n_overflow = u32::from_le_bytes(mmap[12..16].try_into().unwrap()) as usize; + let step = u32::from_le_bytes(mmap[16..20].try_into().unwrap()); + let n_index = u32::from_le_bytes(mmap[20..24].try_into().unwrap()) as usize; + + let primary_offset = HEADER_SIZE; + let data_offset = primary_offset + n; + let index_offset = data_offset + n_overflow * 8; + + // Load the sparse index into RAM (L1-resident at query time). + let mut index = Vec::with_capacity(n_index); + for i in 0..n_index { + let off = index_offset + i * 8; + let slot = u32::from_le_bytes(mmap[off..off + 4].try_into().unwrap()); + let pos = u32::from_le_bytes(mmap[off + 4..off + 8].try_into().unwrap()); + index.push((slot, pos)); + } + + Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset }) } pub fn len(&self) -> usize { - self.primary.len() + self.n } pub fn is_empty(&self) -> bool { - self.primary.is_empty() + self.n == 0 } pub fn get(&self, slot: u64) -> u32 { - match self.primary[slot as usize] { + match self.mmap[self.primary_offset + slot as usize] { 255 => self.overflow_get(slot as u32), v => v as u32, } } fn overflow_get(&self, slot: u32) -> u32 { - let data = self.data.as_ref().expect("sentinel without overflow file"); - let raw = &data[self.data_offset..]; - let pos_start; let pos_end; if self.step == 0 { pos_start = 0; - pos_end = self.n_overflow as usize; + pos_end = self.n_overflow; } else { - // Binary search in the L1-resident sparse index. - let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); + let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); pos_start = self.index[i].1 as usize; pos_end = if i + 1 < self.index.len() { self.index[i + 1].1 as usize } else { - self.n_overflow as usize + self.n_overflow }; } - // Binary search within the block. - let block = Self::data_slice(raw, pos_start, pos_end); - match block.binary_search_by_key(&slot, |&(s, _)| s) { - Ok(i) => block[i].1, - Err(_) => panic!("slot {slot} marked as overflow but not found in data"), + let mut lo = pos_start; + let mut hi = pos_end; + while lo < hi { + let mid = lo + (hi - lo) / 2; + match self.data_slot(mid).cmp(&slot) { + std::cmp::Ordering::Equal => return self.data_value(mid), + std::cmp::Ordering::Less => lo = mid + 1, + std::cmp::Ordering::Greater => hi = mid, + } } + panic!("slot {slot} marked as overflow but not found") } - /// Interpret a byte slice as a slice of (u32, u32) pairs without copying. - fn data_slice(raw: &[u8], from: usize, to: usize) -> &[(u32, u32)] { - let byte_start = from * 8; - let byte_end = to * 8; - let bytes = &raw[byte_start..byte_end]; - // Safety: the file was written as LE u32 pairs; alignment is guaranteed - // because we read from a byte slice and cast explicitly. - let ptr = bytes.as_ptr() as *const (u32, u32); - unsafe { std::slice::from_raw_parts(ptr, to - from) } + #[inline] + fn data_slot(&self, i: usize) -> u32 { + let off = self.data_offset + i * 8; + u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) + } + + #[inline] + fn data_value(&self, i: usize) -> u32 { + let off = self.data_offset + i * 8 + 4; + u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) } } diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs index 581c809..ec9c42d 100644 --- a/src/obicompactvec/src/tests/mod.rs +++ b/src/obicompactvec/src/tests/mod.rs @@ -2,20 +2,15 @@ use tempfile::tempdir; use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder}; -fn primary(dir: &std::path::Path) -> std::path::PathBuf { dir.join("primary.bin") } -fn overflow(dir: &std::path::Path) -> std::path::PathBuf { dir.join("overflow.bin") } - fn roundtrip(values: &[(u64, u32)], n: usize) -> Vec { - let dir = tempdir().unwrap(); + let dir = tempdir().unwrap(); + let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(n); for &(slot, v) in values { b.set(slot, v); } - let ov = overflow(dir.path()); - b.close(&primary(dir.path()), &ov).unwrap(); - - let ov_path = ov.exists().then_some(ov.as_path()); - let r = PersistentCompactIntVec::open(&primary(dir.path()), ov_path).unwrap(); + b.close(&path).unwrap(); + let r = PersistentCompactIntVec::open(&path).unwrap(); (0..n as u64).map(|s| r.get(s)).collect() } @@ -26,17 +21,16 @@ fn all_zero_by_default() { } #[test] -fn small_values_no_overflow_file() { - let dir = tempdir().unwrap(); +fn small_values_no_overflow() { + let dir = tempdir().unwrap(); + let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(4); b.set(0, 1); b.set(1, 254); b.set(2, 0); b.set(3, 100); - let ov = overflow(dir.path()); - b.close(&primary(dir.path()), &ov).unwrap(); - assert!(!ov.exists(), "no overflow file expected"); - let r = PersistentCompactIntVec::open(&primary(dir.path()), None).unwrap(); + b.close(&path).unwrap(); + let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.get(0), 1); assert_eq!(r.get(1), 254); assert_eq!(r.get(2), 0); @@ -55,43 +49,40 @@ fn overflow_values_roundtrip() { #[test] fn mutation_downward_removes_from_overflow() { - let dir = tempdir().unwrap(); + let dir = tempdir().unwrap(); + let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(2); - b.set(0, 1000); // goes to overflow - b.set(0, 42); // comes back below threshold - let ov = overflow(dir.path()); - b.close(&primary(dir.path()), &ov).unwrap(); - assert!(!ov.exists(), "no overflow file expected after downward mutation"); - let r = PersistentCompactIntVec::open(&primary(dir.path()), None).unwrap(); + b.set(0, 1000); + b.set(0, 42); + b.close(&path).unwrap(); + let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.get(0), 42); + assert_eq!(r.step, 0, "no overflow entries expected"); } #[test] fn mutation_upward_updates_overflow() { - let dir = tempdir().unwrap(); + let dir = tempdir().unwrap(); + let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(1); b.set(0, 300); b.set(0, 500); - let ov = overflow(dir.path()); - b.close(&primary(dir.path()), &ov).unwrap(); - let r = PersistentCompactIntVec::open(&primary(dir.path()), Some(&ov)).unwrap(); + b.close(&path).unwrap(); + let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.get(0), 500); } #[test] fn sparse_index_built_for_many_overflows() { - // Generate more than L1_INDEX_ENTRIES (4096) overflow entries to trigger the sparse index. - let n = 5000usize; - let dir = tempdir().unwrap(); + let n = 5000usize; + let dir = tempdir().unwrap(); + let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(n); for i in 0..n { - b.set(i as u64, 1000 + i as u32); // all ≥ 255 + b.set(i as u64, 1000 + i as u32); } - let ov = overflow(dir.path()); - b.close(&primary(dir.path()), &ov).unwrap(); - assert!(ov.exists()); - - let r = PersistentCompactIntVec::open(&primary(dir.path()), Some(&ov)).unwrap(); + b.close(&path).unwrap(); + let r = PersistentCompactIntVec::open(&path).unwrap(); assert!(r.step > 0, "sparse index should have been built"); for i in 0..n { assert_eq!(r.get(i as u64), 1000 + i as u32, "mismatch at slot {i}"); @@ -100,18 +91,16 @@ fn sparse_index_built_for_many_overflows() { #[test] fn mixed_large_dataset() { - // Mirrors realistic distribution: most values small, sparse overflows. - let n = 1000usize; - let dir = tempdir().unwrap(); + let n = 1000usize; + let dir = tempdir().unwrap(); + let path = dir.path().join("test.pciv"); let mut b = PersistentCompactIntVecBuilder::new(n); for i in 0..n { let v = if i % 100 == 0 { 100_000 + i as u32 } else { i as u32 % 200 }; b.set(i as u64, v); } - let ov = overflow(dir.path()); - b.close(&primary(dir.path()), &ov).unwrap(); - - let r = PersistentCompactIntVec::open(&primary(dir.path()), ov.exists().then_some(ov.as_path())).unwrap(); + b.close(&path).unwrap(); + let r = PersistentCompactIntVec::open(&path).unwrap(); for i in 0..n { let expected = if i % 100 == 0 { 100_000 + i as u32 } else { i as u32 % 200 }; assert_eq!(r.get(i as u64), expected, "slot {i}");