Files
obikmer/src/obicompactvec/src/format.rs
T
Eric Coissac 0b3fcf3cf0 feat: add PersistentBitVec and upgrade PersistentCompactIntVec format
Introduces PersistentBitVec, a dense, memory-mapped bit vector optimized for bulk u64-word operations and SIMD acceleration, complete with bitwise operators and Jaccard/Hamming distance metrics. Upgrades PersistentCompactIntVec to a unified .pciv format using 64-bit indices and offsets, consolidating the binary layout and updating builder/reader lifecycles accordingly. Adds corresponding documentation, updates MkDocs navigation, and implements a comprehensive test suite for persistence round-trips, edge cases, and metric accuracy.
2026-05-14 09:01:36 +08:00

69 lines
2.2 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
use std::fs::OpenOptions;
use std::io::{self, BufWriter, Seek, SeekFrom, Write as _};
use std::path::Path;
/// Four-byte signature written at offset 0 of a PCIV file.
pub const MAGIC: [u8; 4] = [b'P', b'C', b'I', b'V'];

/// Header length in bytes:
/// magic(4) + _pad(4) + n(8) + n_overflow(8) + n_index(8) + step(8).
pub const HEADER_SIZE: usize = 4 + 4 + 8 + 8 + 8 + 8;

/// Size in bytes of one overflow entry: slot (u64) followed by value (u32).
pub const OVERFLOW_ENTRY_SIZE: usize = 8 + 4;

/// Size in bytes of one sparse-index entry: slot (u64) followed by pos (u64).
pub const INDEX_ENTRY_SIZE: usize = 8 + 8;

/// Maximum number of sparse-index entries. The target is an index of at most
/// 32 KB so that it stays in L1 cache (16 B per entry -> 2048 entries).
pub const L1_INDEX_ENTRIES: usize = 32 * 1024 / INDEX_ENTRY_SIZE;
/// Finalise a PCIV file whose placeholder header and primary section are already on disk.
///
/// The file at `path` is resized to exactly `HEADER_SIZE + n` bytes, which drops
/// anything that was stored after the primary section. Two sections are then
/// appended, in this order:
///
/// ```text
/// overflow   n_overflow x 12 B   (slot: u64, value: u32)   one record per element of `entries`
/// index      n_index    x 16 B   (slot: u64, pos: u64)     sparse index over the overflow section
/// ```
///
/// Finally the placeholder header at offset 0 is overwritten with
/// `magic(4) + pad(4) + n(8) + n_overflow(8) + n_index(8) + step(8)`.
/// Every integer is written little-endian.
///
/// # Sparse index
///
/// When `entries.len() <= L1_INDEX_ENTRIES` no index is written and both `step`
/// and `n_index` are stored as 0. Otherwise `step = ceil(n_overflow / L1_INDEX_ENTRIES)`
/// and one index record is written for every `step`-th overflow entry, holding that
/// entry's slot and its position (entry number) inside the overflow section.
///
/// # Arguments
///
/// * `path` - an existing file, opened here for reading and writing.
/// * `n` - length in bytes of the primary section that follows the header.
/// * `entries` - overflow `(slot, value)` pairs. They are written in the order
///   given; the format expects them sorted by slot, but this function neither
///   sorts nor checks them, so the caller must supply them already sorted.
///
/// # Errors
///
/// Returns `InvalidInput` if `HEADER_SIZE + n` does not fit in a `u64`, and
/// otherwise any I/O error raised while opening, resizing, seeking or writing.
pub fn finalize_pciv(path: &Path, n: usize, entries: &[(usize, u32)]) -> io::Result<()> {
    let n_overflow = entries.len();

    // End of the primary section, computed without silent wrap-around.
    let primary_end = (HEADER_SIZE as u64)
        .checked_add(n as u64)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::InvalidInput,
                "PCIV primary section size overflows u64",
            )
        })?;

    let file = OpenOptions::new().read(true).write(true).open(path)?;
    file.set_len(primary_end)?;

    let mut w = BufWriter::new(file);
    w.seek(SeekFrom::End(0))?;

    // Overflow section: 12-byte records, in caller order.
    for &(slot, value) in entries {
        w.write_all(&(slot as u64).to_le_bytes())?;
        w.write_all(&value.to_le_bytes())?;
    }

    // A step of 0 means "no sparse index".
    let step: usize = if n_overflow <= L1_INDEX_ENTRIES {
        0
    } else {
        n_overflow.div_ceil(L1_INDEX_ENTRIES)
    };

    // Sparse index: one record for the first entry of every `step`-sized block.
    let mut n_index: usize = 0;
    if step > 0 {
        for (pos, &(slot, _)) in entries.iter().enumerate().step_by(step) {
            w.write_all(&(slot as u64).to_le_bytes())?;
            w.write_all(&(pos as u64).to_le_bytes())?;
            n_index += 1;
        }
    }

    // Assemble the whole header in memory so it reaches the file in one write
    // instead of six small ones.
    let mut header = [0u8; HEADER_SIZE];
    header[..MAGIC.len()].copy_from_slice(&MAGIC);
    // Bytes 4..8 are padding and stay zero.
    let fields = [n as u64, n_overflow as u64, n_index as u64, step as u64];
    for (dst, field) in header[8..].chunks_exact_mut(8).zip(fields.iter()) {
        dst.copy_from_slice(&field.to_le_bytes());
    }

    // `BufWriter::seek` flushes the buffered sections before moving, so the
    // header is only rewritten once the data sections were handed to the file.
    w.seek(SeekFrom::Start(0))?;
    w.write_all(&header)?;
    w.flush()
}