// Source snapshot at commit 0b3fcf3cf0 (Rust, 69 lines, 2.2 KiB).
//
// Commit summary: Introduces PersistentBitVec, a dense, memory-mapped bit vector optimized for bulk u64-word operations and SIMD acceleration, complete with bitwise operators and Jaccard/Hamming distance metrics. Upgrades PersistentCompactIntVec to a unified .pciv format using 64-bit indices and offsets, consolidating the binary layout and updating builder/reader lifecycles accordingly. Adds corresponding documentation, updates MkDocs navigation, and implements a comprehensive test suite for persistence round-trips, edge cases, and metric accuracy.
use std::fs::OpenOptions;
use std::io::{self, BufWriter, Seek, SeekFrom, Write as _};
use std::path::Path;

/// Magic bytes written at offset 0 of every PCIV file.
pub const MAGIC: [u8; 4] = *b"PCIV";

/// Size of the file header in bytes:
/// magic(4) + _pad(4) + n(8) + n_overflow(8) + n_index(8) + step(8).
pub const HEADER_SIZE: usize = 40;

/// Size of one overflow entry in bytes: slot(u64) + value(u32).
pub const OVERFLOW_ENTRY_SIZE: usize = 12;

/// Size of one sparse-index entry in bytes: slot(u64) + pos(u64).
pub const INDEX_ENTRY_SIZE: usize = 16;

/// Upper bound on the number of sparse-index entries.
///
/// Target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries).
pub const L1_INDEX_ENTRIES: usize = 2048;

/// Finalise a PCIV file whose placeholder header and primary section are already on disk.
///
/// The file at `path` is resized to `HEADER_SIZE + n` bytes (dropping anything that was
/// stored past the primary section), then the following sections are appended, all
/// fields little-endian:
///
/// ```text
/// overflow   n_overflow × 12 B   (slot: u64, value: u32)
/// index      n_index    × 16 B   (slot: u64, pos:  u64)   sparse index
/// ```
///
/// Finally the placeholder header at offset 0 is overwritten with
/// `MAGIC`, 4 zero padding bytes, `n`, `n_overflow`, `n_index` and `step`.
///
/// * `n` – length in bytes of the primary section that follows the header.
/// * `entries` – overflow `(slot, value)` pairs. They are written in the order given;
///   this function does not sort them, so the caller is expected to pass them sorted
///   by slot (the sparse index records the first slot of every block).
///
/// No sparse index is written (`n_index == 0`, `step == 0`) while the overflow table has
/// at most `L1_INDEX_ENTRIES` entries. Above that, one index entry is emitted for every
/// `step`-th overflow entry, where `step = ceil(n_overflow / L1_INDEX_ENTRIES)`, and
/// `pos` is the position of that entry within the overflow table.
///
/// The function only flushes its writers; it does not call `sync_all`.
///
/// # Errors
///
/// Returns `io::ErrorKind::InvalidInput` (before touching the file) when
/// `HEADER_SIZE + n` overflows `usize`, and propagates any I/O error raised while
/// opening, resizing, seeking or writing the file.
pub fn finalize_pciv(path: &Path, n: usize, entries: &[(usize, u32)]) -> io::Result<()> {
    let n_overflow = entries.len();

    // An unchecked `HEADER_SIZE + n` would wrap in release builds and make `set_len`
    // silently truncate the file to a tiny size, so reject it up front.
    let primary_end = HEADER_SIZE.checked_add(n).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidInput,
            "PCIV primary section length overflows the file size",
        )
    })?;

    let file = OpenOptions::new().read(true).write(true).open(path)?;
    file.set_len(primary_end as u64)?;

    let mut w = BufWriter::new(file);
    w.seek(SeekFrom::End(0))?;

    // Overflow table. `copy_from_slice` panics on a length mismatch, which keeps the
    // encoding below in lock-step with OVERFLOW_ENTRY_SIZE.
    for &(slot, value) in entries {
        let mut record = [0u8; OVERFLOW_ENTRY_SIZE];
        record[..8].copy_from_slice(&(slot as u64).to_le_bytes());
        record[8..].copy_from_slice(&value.to_le_bytes());
        w.write_all(&record)?;
    }

    // A step of 0 means "no sparse index".
    let step: usize = if n_overflow <= L1_INDEX_ENTRIES {
        0
    } else {
        n_overflow.div_ceil(L1_INDEX_ENTRIES)
    };

    // Sparse index: the first entry of every `step`-sized block of the overflow table.
    let mut n_index: usize = 0;
    if step > 0 {
        for (pos, &(slot, _)) in entries.iter().enumerate().step_by(step) {
            let mut record = [0u8; INDEX_ENTRY_SIZE];
            record[..8].copy_from_slice(&(slot as u64).to_le_bytes());
            record[8..].copy_from_slice(&(pos as u64).to_le_bytes());
            w.write_all(&record)?;
            n_index += 1;
        }
    }

    // `into_inner` flushes the buffer and hands back the file (or the flush error).
    let mut file = w.into_inner().map_err(|e| e.into_error())?;

    // Assemble the header in memory and write it with a single call.
    let mut header = [0u8; HEADER_SIZE];
    header[0..4].copy_from_slice(&MAGIC);
    // Bytes 4..8 are padding and stay zero.
    header[8..16].copy_from_slice(&(n as u64).to_le_bytes());
    header[16..24].copy_from_slice(&(n_overflow as u64).to_le_bytes());
    header[24..32].copy_from_slice(&(n_index as u64).to_le_bytes());
    header[32..40].copy_from_slice(&(step as u64).to_le_bytes());

    file.seek(SeekFrom::Start(0))?;
    file.write_all(&header)?;
    file.flush()
}