feat: add PersistentBitVec and upgrade PersistentCompactIntVec format
Introduces PersistentBitVec, a dense, memory-mapped bit vector optimized for bulk u64-word operations and SIMD acceleration, complete with bitwise operators and Jaccard/Hamming distance metrics. Upgrades PersistentCompactIntVec to a unified .pciv format using 64-bit indices and offsets, consolidating the binary layout and updating builder/reader lifecycles accordingly. Adds corresponding documentation, updates MkDocs navigation, and implements a comprehensive test suite for persistence round-trips, edge cases, and metric accuracy.
This commit is contained in:
@@ -0,0 +1,241 @@
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{self, Seek, SeekFrom, Write as _};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::{Mmap, MmapMut};
|
||||
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
|
||||
const MAGIC: [u8; 4] = *b"PBIV";
|
||||
|
||||
// Header: magic(4) + _pad(4) + n(8) = 16 bytes.
|
||||
// Data starts at offset 16, which is divisible by 8 → u64-aligned
|
||||
// (mmap base is page-aligned, 16 % 8 == 0).
|
||||
const HEADER_SIZE: usize = 16;
|
||||
|
||||
#[inline]
|
||||
fn n_words(n: usize) -> usize { n.div_ceil(64) }
|
||||
|
||||
#[inline]
|
||||
fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 }
|
||||
|
||||
// ── Reader ────────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentBitVec {
|
||||
mmap: Mmap,
|
||||
n: usize,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl PersistentBitVec {
|
||||
pub fn open(path: &Path) -> io::Result<Self> {
|
||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||
if mmap.len() < HEADER_SIZE {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short"));
|
||||
}
|
||||
if &mmap[0..4] != &MAGIC {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
|
||||
}
|
||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||
}
|
||||
|
||||
pub fn path(&self) -> &Path { &self.path }
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
|
||||
pub fn get(&self, slot: usize) -> bool {
|
||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||
}
|
||||
|
||||
// Used by iter() and get(): exact byte window, no padding.
|
||||
fn data_bytes(&self) -> &[u8] {
|
||||
&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n.div_ceil(8)]
|
||||
}
|
||||
|
||||
// Bulk word view. SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
|
||||
// so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes.
|
||||
fn data_words(&self) -> &[u64] {
|
||||
let nw = n_words(self.n);
|
||||
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
||||
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||
}
|
||||
|
||||
pub fn count_ones(&self) -> u64 {
|
||||
// Padding bits in the last word are 0, so no masking needed.
|
||||
self.data_words().iter().map(|w| w.count_ones() as u64).sum()
|
||||
}
|
||||
|
||||
pub fn count_zeros(&self) -> u64 {
|
||||
self.n as u64 - self.count_ones()
|
||||
}
|
||||
|
||||
pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
let (inter, union) = self.data_words().iter().zip(other.data_words()).fold(
|
||||
(0u64, 0u64),
|
||||
|(i, u), (&a, &b)| (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64),
|
||||
);
|
||||
if union == 0 { return 0.0; }
|
||||
1.0 - inter as f64 / union as f64
|
||||
}
|
||||
|
||||
pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
self.data_words().iter().zip(other.data_words())
|
||||
.map(|(&a, &b)| (a ^ b).count_ones() as u64)
|
||||
.sum()
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> BitIter<'_> {
|
||||
BitIter { bytes: self.data_bytes(), slot: 0, n: self.n }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> IntoIterator for &'a PersistentBitVec {
|
||||
type Item = bool;
|
||||
type IntoIter = BitIter<'a>;
|
||||
fn into_iter(self) -> BitIter<'a> { self.iter() }
|
||||
}
|
||||
|
||||
pub struct BitIter<'a> {
|
||||
bytes: &'a [u8],
|
||||
slot: usize,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl ExactSizeIterator for BitIter<'_> {}
|
||||
|
||||
impl Iterator for BitIter<'_> {
|
||||
type Item = bool;
|
||||
|
||||
fn next(&mut self) -> Option<bool> {
|
||||
if self.slot >= self.n { return None; }
|
||||
let v = (self.bytes[self.slot >> 3] >> (self.slot & 7)) & 1 != 0;
|
||||
self.slot += 1;
|
||||
Some(v)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let rem = self.n - self.slot;
|
||||
(rem, Some(rem))
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentBitVecBuilder {
|
||||
mmap: MmapMut,
|
||||
n: usize,
|
||||
}
|
||||
|
||||
impl PersistentBitVecBuilder {
|
||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||
let mut file = OpenOptions::new()
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.write_all(&MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?; // padding
|
||||
file.write_all(&(n as u64).to_le_bytes())?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.set_len(file_size as u64)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
Ok(Self { mmap, n })
|
||||
}
|
||||
|
||||
pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
|
||||
fs::copy(source.path(), path)?;
|
||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
let n = source.len();
|
||||
Ok(Self { mmap, n })
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
|
||||
pub fn get(&self, slot: usize) -> bool {
|
||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||
}
|
||||
|
||||
pub fn set(&mut self, slot: usize, value: bool) {
|
||||
let byte = HEADER_SIZE + (slot >> 3);
|
||||
let bit = 1u8 << (slot & 7);
|
||||
if value { self.mmap[byte] |= bit; } else { self.mmap[byte] &= !bit; }
|
||||
}
|
||||
|
||||
// SAFETY: same alignment argument as PersistentBitVec::data_words.
|
||||
fn data_words_mut(&mut self) -> &mut [u64] {
|
||||
let nw = n_words(self.n);
|
||||
let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||
unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
|
||||
}
|
||||
|
||||
pub fn and(&mut self, other: &PersistentBitVec) {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { *sw &= ow; }
|
||||
}
|
||||
|
||||
pub fn or(&mut self, other: &PersistentBitVec) {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { *sw |= ow; }
|
||||
}
|
||||
|
||||
pub fn xor(&mut self, other: &PersistentBitVec) {
|
||||
assert_eq!(self.n, other.n, "length mismatch");
|
||||
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { *sw ^= ow; }
|
||||
}
|
||||
|
||||
pub fn not(&mut self) {
|
||||
let rem = self.n % 64;
|
||||
let words = self.data_words_mut();
|
||||
for w in words.iter_mut() { *w ^= u64::MAX; }
|
||||
// Zero padding bits in the last word so count_ones / jaccard remain correct.
|
||||
if rem != 0 {
|
||||
if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; }
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a count vector to a bit vector: bit set iff count >= threshold.
|
||||
/// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
|
||||
pub fn build_from_counts(
|
||||
source: &PersistentCompactIntVec,
|
||||
threshold: u32,
|
||||
path: &Path,
|
||||
) -> io::Result<Self> {
|
||||
let n = source.len();
|
||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||
let mut file = OpenOptions::new()
|
||||
.read(true).write(true).create(true).truncate(true)
|
||||
.open(path)?;
|
||||
file.write_all(&MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?;
|
||||
file.write_all(&(n as u64).to_le_bytes())?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.set_len(file_size as u64)?;
|
||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
|
||||
{
|
||||
let nw = n_words(n);
|
||||
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
||||
for (slot, count) in source.iter().enumerate() {
|
||||
if count >= threshold {
|
||||
words[slot >> 6] |= 1u64 << (slot & 63);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self { mmap, n })
|
||||
}
|
||||
|
||||
/// Convert a count vector to a presence/absence bit vector (threshold = 1).
|
||||
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||
Self::build_from_counts(source, 1, path)
|
||||
}
|
||||
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.mmap.flush()
|
||||
}
|
||||
}
|
||||
@@ -5,23 +5,33 @@ use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::MmapMut;
|
||||
|
||||
use crate::format::{finalize_pciv, HEADER_SIZE};
|
||||
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, finalize_pciv};
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
|
||||
pub struct PersistentCompactIntVecBuilder {
|
||||
path: PathBuf,
|
||||
mmap: MmapMut,
|
||||
n: usize,
|
||||
overflow: HashMap<u64, u32>,
|
||||
path: PathBuf,
|
||||
mmap: MmapMut,
|
||||
n: usize,
|
||||
overflow: HashMap<usize, u32>,
|
||||
}
|
||||
|
||||
impl PersistentCompactIntVecBuilder {
|
||||
/// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately.
|
||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||
let file = OpenOptions::new().read(true).write(true).create(true).truncate(true).open(path)?;
|
||||
let file = OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(path)?;
|
||||
file.set_len((HEADER_SIZE + n) as u64)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
|
||||
Ok(Self {
|
||||
path: path.to_path_buf(),
|
||||
mmap,
|
||||
n,
|
||||
overflow: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
|
||||
@@ -32,47 +42,60 @@ impl PersistentCompactIntVecBuilder {
|
||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||
|
||||
let n = source.len();
|
||||
let n_overflow = u32::from_le_bytes(mmap[12..16].try_into().unwrap()) as usize;
|
||||
let n = source.len();
|
||||
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||
let data_offset = HEADER_SIZE + n;
|
||||
|
||||
let mut overflow = HashMap::with_capacity(n_overflow);
|
||||
for i in 0..n_overflow {
|
||||
let off = data_offset + i * 8;
|
||||
let slot = u32::from_le_bytes(mmap[off..off + 4].try_into().unwrap()) as u64;
|
||||
let value = u32::from_le_bytes(mmap[off + 4..off + 8].try_into().unwrap());
|
||||
let off = data_offset + i * OVERFLOW_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
|
||||
let value = u32::from_le_bytes(mmap[off + 8..off + 12].try_into().unwrap());
|
||||
overflow.insert(slot, value);
|
||||
}
|
||||
|
||||
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
||||
Ok(Self {
|
||||
path: path.to_path_buf(),
|
||||
mmap,
|
||||
n,
|
||||
overflow,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get(&self, slot: u64) -> u32 {
|
||||
match self.mmap[HEADER_SIZE + slot as usize] {
|
||||
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
||||
v => v as u32,
|
||||
/// Get the value at the given slot, handling overflow if necessary.
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
match self.mmap[HEADER_SIZE + slot] {
|
||||
255 => *self
|
||||
.overflow
|
||||
.get(&slot)
|
||||
.expect("sentinel without overflow entry"),
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set(&mut self, slot: u64, value: u32) {
|
||||
pub fn set(&mut self, slot: usize, value: u32) {
|
||||
if value < 255 {
|
||||
self.mmap[HEADER_SIZE + slot as usize] = value as u8;
|
||||
self.mmap[HEADER_SIZE + slot] = value as u8;
|
||||
self.overflow.remove(&slot);
|
||||
} else {
|
||||
self.mmap[HEADER_SIZE + slot as usize] = 255;
|
||||
self.mmap[HEADER_SIZE + slot] = 255;
|
||||
self.overflow.insert(slot, value);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize { self.n }
|
||||
pub fn len(&self) -> usize {
|
||||
self.n
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.n == 0
|
||||
}
|
||||
|
||||
pub fn min(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
if other_val < self.get(slot as u64) {
|
||||
self.set(slot as u64, other_val);
|
||||
if other_val < self.get(slot) {
|
||||
self.set(slot, other_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -80,8 +103,8 @@ impl PersistentCompactIntVecBuilder {
|
||||
pub fn max(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
if other_val > self.get(slot as u64) {
|
||||
self.set(slot as u64, other_val);
|
||||
if other_val > self.get(slot) {
|
||||
self.set(slot, other_val);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -89,28 +112,30 @@ impl PersistentCompactIntVecBuilder {
|
||||
pub fn add(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
let cur = self.get(slot as u64);
|
||||
self.set(slot as u64, cur.checked_add(other_val).expect("u32 overflow in add"));
|
||||
let cur = self.get(slot);
|
||||
self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add"));
|
||||
}
|
||||
}
|
||||
|
||||
pub fn diff(&mut self, other: &PersistentCompactIntVec) {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
for (slot, other_val) in other.iter().enumerate() {
|
||||
self.set(slot as u64, self.get(slot as u64).saturating_sub(other_val));
|
||||
self.set(slot, self.get(slot).saturating_sub(other_val));
|
||||
}
|
||||
}
|
||||
|
||||
/// Flush the primary mmap, then write sorted overflow data + index and fix the header.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.mmap.flush()?;
|
||||
let Self { path, mmap, n, overflow } = self;
|
||||
let Self {
|
||||
path,
|
||||
mmap,
|
||||
n,
|
||||
overflow,
|
||||
} = self;
|
||||
drop(mmap);
|
||||
|
||||
let mut entries: Vec<(u32, u32)> = overflow
|
||||
.into_iter()
|
||||
.map(|(slot, val)| (slot as u32, val))
|
||||
.collect();
|
||||
let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
|
||||
entries.sort_unstable_by_key(|&(slot, _)| slot);
|
||||
|
||||
finalize_pciv(&path, n, &entries)
|
||||
|
||||
@@ -3,18 +3,26 @@ use std::io::{self, BufWriter, Seek, SeekFrom, Write as _};
|
||||
use std::path::Path;
|
||||
|
||||
pub const MAGIC: [u8; 4] = *b"PCIV";
|
||||
pub const HEADER_SIZE: usize = 24; // magic(4) + n(8) + n_overflow(4) + step(4) + n_index(4)
|
||||
|
||||
// Sparse index target: fits in 32 KB L1 cache / 8 bytes per entry.
|
||||
pub const L1_INDEX_ENTRIES: usize = 4096;
|
||||
// magic(4) + _pad(4) + n(8) + n_overflow(8) + n_index(8) + step(8)
|
||||
pub const HEADER_SIZE: usize = 40;
|
||||
|
||||
// Overflow entry: slot(u64) + value(u32) = 12 bytes.
|
||||
pub const OVERFLOW_ENTRY_SIZE: usize = 12;
|
||||
|
||||
// Index entry: slot(u64) + pos(u64) = 16 bytes.
|
||||
pub const INDEX_ENTRY_SIZE: usize = 16;
|
||||
|
||||
// Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries).
|
||||
pub const L1_INDEX_ENTRIES: usize = 2048;
|
||||
|
||||
/// Finalise a PCIV file whose placeholder header and primary section are already on disk.
|
||||
///
|
||||
/// Truncates the file to `HEADER_SIZE + n`, then appends:
|
||||
/// data n_overflow × 8 B (slot: u32, value: u32) sorted by slot
|
||||
/// index n_index × 8 B (slot: u32, pos: u32) sparse index
|
||||
/// and overwrites the header placeholder at offset 0.
|
||||
pub fn finalize_pciv(path: &Path, n: usize, entries: &[(u32, u32)]) -> io::Result<()> {
|
||||
/// overflow n_overflow × 12 B (slot: u64, value: u32) sorted by slot
|
||||
/// index n_index × 16 B (slot: u64, pos: u64) sparse index
|
||||
/// and overwrites the placeholder header at offset 0.
|
||||
pub fn finalize_pciv(path: &Path, n: usize, entries: &[(usize, u32)]) -> io::Result<()> {
|
||||
let n_overflow = entries.len();
|
||||
|
||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||
@@ -24,21 +32,21 @@ pub fn finalize_pciv(path: &Path, n: usize, entries: &[(u32, u32)]) -> io::Resul
|
||||
w.seek(SeekFrom::End(0))?;
|
||||
|
||||
for &(slot, value) in entries {
|
||||
w.write_all(&slot.to_le_bytes())?;
|
||||
w.write_all(&(slot as u64).to_le_bytes())?;
|
||||
w.write_all(&value.to_le_bytes())?;
|
||||
}
|
||||
|
||||
let step: u32 = if n_overflow <= L1_INDEX_ENTRIES {
|
||||
let step: usize = if n_overflow <= L1_INDEX_ENTRIES {
|
||||
0
|
||||
} else {
|
||||
n_overflow.div_ceil(L1_INDEX_ENTRIES) as u32
|
||||
n_overflow.div_ceil(L1_INDEX_ENTRIES)
|
||||
};
|
||||
|
||||
let n_index: u32 = if step > 0 {
|
||||
let count = n_overflow.div_ceil(step as usize) as u32;
|
||||
for (block, chunk) in entries.chunks(step as usize).enumerate() {
|
||||
let slot = chunk[0].0;
|
||||
let pos = (block * step as usize) as u32;
|
||||
let n_index: usize = if step > 0 {
|
||||
let count = n_overflow.div_ceil(step);
|
||||
for (block, chunk) in entries.chunks(step).enumerate() {
|
||||
let slot = chunk[0].0 as u64;
|
||||
let pos = (block * step) as u64;
|
||||
w.write_all(&slot.to_le_bytes())?;
|
||||
w.write_all(&pos.to_le_bytes())?;
|
||||
}
|
||||
@@ -51,9 +59,10 @@ pub fn finalize_pciv(path: &Path, n: usize, entries: &[(u32, u32)]) -> io::Resul
|
||||
let mut file = w.into_inner().map_err(|e| e.into_error())?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
file.write_all(&MAGIC)?;
|
||||
file.write_all(&[0u8; 4])?;
|
||||
file.write_all(&(n as u64).to_le_bytes())?;
|
||||
file.write_all(&(n_overflow as u32).to_le_bytes())?;
|
||||
file.write_all(&step.to_le_bytes())?;
|
||||
file.write_all(&n_index.to_le_bytes())?;
|
||||
file.write_all(&(n_overflow as u64).to_le_bytes())?;
|
||||
file.write_all(&(n_index as u64).to_le_bytes())?;
|
||||
file.write_all(&(step as u64).to_le_bytes())?;
|
||||
file.flush()
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
mod bitvec;
|
||||
mod builder;
|
||||
mod format;
|
||||
mod reader;
|
||||
|
||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
|
||||
|
||||
+111
-23
@@ -4,16 +4,16 @@ use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
use crate::format::{HEADER_SIZE, MAGIC};
|
||||
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE};
|
||||
|
||||
pub struct PersistentCompactIntVec {
|
||||
mmap: Mmap,
|
||||
n: usize,
|
||||
n_overflow: usize,
|
||||
pub step: u32,
|
||||
index: Vec<(u32, u32)>, // (slot, pos) — L1-resident sparse index
|
||||
primary_offset: usize, // = HEADER_SIZE
|
||||
data_offset: usize, // = HEADER_SIZE + n
|
||||
pub step: usize,
|
||||
index: Vec<(usize, usize)>, // (slot, pos) — L1-resident sparse index
|
||||
primary_offset: usize, // = HEADER_SIZE
|
||||
data_offset: usize, // = HEADER_SIZE + n
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
@@ -28,20 +28,20 @@ impl PersistentCompactIntVec {
|
||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
||||
}
|
||||
|
||||
let n = u64::from_le_bytes(mmap[4..12].try_into().unwrap()) as usize;
|
||||
let n_overflow = u32::from_le_bytes(mmap[12..16].try_into().unwrap()) as usize;
|
||||
let step = u32::from_le_bytes(mmap[16..20].try_into().unwrap());
|
||||
let n_index = u32::from_le_bytes(mmap[20..24].try_into().unwrap()) as usize;
|
||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||
let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
|
||||
let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
|
||||
|
||||
let primary_offset = HEADER_SIZE;
|
||||
let data_offset = primary_offset + n;
|
||||
let index_offset = data_offset + n_overflow * 8;
|
||||
let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
|
||||
|
||||
let mut index = Vec::with_capacity(n_index);
|
||||
for i in 0..n_index {
|
||||
let off = index_offset + i * 8;
|
||||
let slot = u32::from_le_bytes(mmap[off..off + 4].try_into().unwrap());
|
||||
let pos = u32::from_le_bytes(mmap[off + 4..off + 8].try_into().unwrap());
|
||||
let off = index_offset + i * INDEX_ENTRY_SIZE;
|
||||
let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
|
||||
let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
|
||||
index.push((slot, pos));
|
||||
}
|
||||
|
||||
@@ -69,14 +69,14 @@ impl PersistentCompactIntVec {
|
||||
self.n == 0
|
||||
}
|
||||
|
||||
pub fn get(&self, slot: u64) -> u32 {
|
||||
match self.mmap[self.primary_offset + slot as usize] {
|
||||
255 => self.overflow_get(slot as u32),
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
match self.mmap[self.primary_offset + slot] {
|
||||
255 => self.overflow_get(slot),
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
|
||||
fn overflow_get(&self, slot: u32) -> u32 {
|
||||
fn overflow_get(&self, slot: usize) -> u32 {
|
||||
let pos_start;
|
||||
let pos_end;
|
||||
|
||||
@@ -85,9 +85,9 @@ impl PersistentCompactIntVec {
|
||||
pos_end = self.n_overflow;
|
||||
} else {
|
||||
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||
pos_start = self.index[i].1 as usize;
|
||||
pos_start = self.index[i].1;
|
||||
pos_end = if i + 1 < self.index.len() {
|
||||
self.index[i + 1].1 as usize
|
||||
self.index[i + 1].1
|
||||
} else {
|
||||
self.n_overflow
|
||||
};
|
||||
@@ -107,14 +107,14 @@ impl PersistentCompactIntVec {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn data_slot(&self, i: usize) -> u32 {
|
||||
let off = self.data_offset + i * 8;
|
||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||
fn data_slot(&self, i: usize) -> usize {
|
||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
|
||||
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn data_value(&self, i: usize) -> u32 {
|
||||
let off = self.data_offset + i * 8 + 4;
|
||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
|
||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||
}
|
||||
|
||||
@@ -122,6 +122,94 @@ impl PersistentCompactIntVec {
|
||||
self.iter().map(|v| v as u64).sum()
|
||||
}
|
||||
|
||||
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let (sum_min, sum_a, sum_b) = self.iter().zip(other.iter()).fold(
|
||||
(0u64, 0u64, 0u64),
|
||||
|(sm, sa, sb), (a, b)| (sm + a.min(b) as u64, sa + a as u64, sb + b as u64),
|
||||
);
|
||||
let denom = sum_a + sum_b;
|
||||
if denom == 0 { return 0.0; }
|
||||
1.0 - 2.0 * sum_min as f64 / denom as f64
|
||||
}
|
||||
|
||||
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
|
||||
let sum_min: f64 = self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||
pa.min(pb)
|
||||
})
|
||||
.sum();
|
||||
1.0 - sum_min
|
||||
}
|
||||
|
||||
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sq: f64 = self.iter().zip(other.iter())
|
||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||
.sum();
|
||||
sq.sqrt()
|
||||
}
|
||||
|
||||
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
|
||||
let sq: f64 = self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||
let d = pa - pb;
|
||||
d * d
|
||||
})
|
||||
.sum();
|
||||
sq.sqrt()
|
||||
}
|
||||
|
||||
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let sum_a = self.sum() as f64;
|
||||
let sum_b = other.sum() as f64;
|
||||
if sum_a == 0.0 && sum_b == 0.0 { return 0.0; }
|
||||
let sq: f64 = self.iter().zip(other.iter())
|
||||
.map(|(a, b)| {
|
||||
let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
|
||||
let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
|
||||
let d = pa - pb;
|
||||
d * d
|
||||
})
|
||||
.sum();
|
||||
sq.sqrt()
|
||||
}
|
||||
|
||||
pub fn hellinger_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2
|
||||
}
|
||||
|
||||
pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
|
||||
assert_eq!(self.n, other.len(), "length mismatch");
|
||||
let (intersection, union) = self.iter().zip(other.iter()).fold(
|
||||
(0u64, 0u64),
|
||||
|(inter, uni), (a, b)| {
|
||||
let ap = a >= threshold;
|
||||
let bp = b >= threshold;
|
||||
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||
},
|
||||
);
|
||||
if union == 0 { return 0.0; }
|
||||
1.0 - intersection as f64 / union as f64
|
||||
}
|
||||
|
||||
pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||
self.threshold_jaccard_dist(other, 1)
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> Iter<'_> {
|
||||
Iter { pciv: self, slot: 0, overflow_pos: 0 }
|
||||
}
|
||||
|
||||
@@ -0,0 +1,216 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||
|
||||
fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("v.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::new(bits.len(), &path).unwrap();
|
||||
for (i, &v) in bits.iter().enumerate() {
|
||||
b.set(i, v);
|
||||
}
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
(dir, r)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn all_false_by_default() {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("v.pbiv");
|
||||
let b = PersistentBitVecBuilder::new(16, &path).unwrap();
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert!(r.iter().all(|v| !v));
|
||||
assert_eq!(r.count_ones(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_get_roundtrip() {
|
||||
let bits = vec![true, false, true, true, false, false, true, false, true];
|
||||
let (_dir, r) = make_bv(&bits);
|
||||
for (i, &expected) in bits.iter().enumerate() {
|
||||
assert_eq!(r.get(i), expected, "slot {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_matches_get() {
|
||||
let bits = vec![true, false, true, false, true, true, false, true, false, true];
|
||||
let (_dir, r) = make_bv(&bits);
|
||||
let via_iter: Vec<bool> = r.iter().collect();
|
||||
let via_get: Vec<bool> = (0..bits.len()).map(|s| r.get(s)).collect();
|
||||
assert_eq!(via_iter, via_get);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn count_ones_and_zeros() {
|
||||
let bits = vec![true, false, true, true, false];
|
||||
let (_dir, r) = make_bv(&bits);
|
||||
assert_eq!(r.count_ones(), 3);
|
||||
assert_eq!(r.count_zeros(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn non_byte_aligned_length() {
|
||||
// 13 bits — last byte has 5 padding zeros
|
||||
let bits: Vec<bool> = (0..13).map(|i| i % 3 == 0).collect();
|
||||
let (_dir, r) = make_bv(&bits);
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), bits);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_from_roundtrip() {
|
||||
let bits = vec![true, false, true, false, true, true];
|
||||
let (_da, ra) = make_bv(&bits);
|
||||
let dir_b = tempdir().unwrap();
|
||||
let path_b = dir_b.path().join("b.pbiv");
|
||||
PersistentBitVecBuilder::build_from(&ra, &path_b).unwrap().close().unwrap();
|
||||
let rb = PersistentBitVec::open(&path_b).unwrap();
|
||||
assert_eq!(ra.iter().collect::<Vec<_>>(), rb.iter().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn op_and() {
|
||||
let (_da, ra) = make_bv(&[true, true, false, false]);
|
||||
let (_db, rb) = make_bv(&[true, false, true, false]);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.and(&rb);
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn op_or() {
|
||||
let (_da, ra) = make_bv(&[true, true, false, false]);
|
||||
let (_db, rb) = make_bv(&[true, false, true, false]);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.or(&rb);
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn op_xor() {
|
||||
let (_da, ra) = make_bv(&[true, true, false, false]);
|
||||
let (_db, rb) = make_bv(&[true, false, true, false]);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.xor(&rb);
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn op_not_byte_aligned() {
|
||||
let bits = vec![true, false, true, false, true, false, false, true]; // 8 bits, no padding
|
||||
let (_da, ra) = make_bv(&bits);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.not();
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
let expected: Vec<bool> = bits.iter().map(|&v| !v).collect();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn op_not_non_aligned_no_padding_leak() {
|
||||
// 5 bits: [T,F,T,F,T] → NOT → [F,T,F,T,F]; padding bits must stay 0
|
||||
let bits = vec![true, false, true, false, true];
|
||||
let (_da, ra) = make_bv(&bits);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("out.pbiv");
|
||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||
b.not();
|
||||
b.close().unwrap();
|
||||
let r = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, false, true, false]);
|
||||
// count_ones must reflect exactly the 2 set bits, not 2 + stray padding bits
|
||||
assert_eq!(r.count_ones(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jaccard_dist_basic() {
|
||||
// A={0,2} B={0,1} → inter=1 union=3 → dist=2/3
|
||||
let (_da, ra) = make_bv(&[true, false, true, false]);
|
||||
let (_db, rb) = make_bv(&[true, true, false, false]);
|
||||
let d = ra.jaccard_dist(&rb);
|
||||
assert!((d - 2.0 / 3.0).abs() < 1e-12, "got {d}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jaccard_dist_identical() {
|
||||
let (_d, r) = make_bv(&[true, false, true]);
|
||||
assert_eq!(r.jaccard_dist(&r), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jaccard_dist_disjoint() {
|
||||
let (_da, ra) = make_bv(&[true, false]);
|
||||
let (_db, rb) = make_bv(&[false, true]);
|
||||
assert_eq!(ra.jaccard_dist(&rb), 1.0);
|
||||
}
|
||||
|
||||
fn make_pciv(counts: &[u32]) -> (tempfile::TempDir, PersistentCompactIntVec) {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("c.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::new(counts.len(), &path).unwrap();
|
||||
for (i, &v) in counts.iter().enumerate() { b.set(i, v); }
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
(dir, r)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_from_presence_basic() {
|
||||
let counts = vec![0u32, 3, 0, 7, 1, 0];
|
||||
let (_dc, rc) = make_pciv(&counts);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("bv.pbiv");
|
||||
PersistentBitVecBuilder::build_from_presence(&rc, &path).unwrap().close().unwrap();
|
||||
let bv = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(bv.iter().collect::<Vec<_>>(), vec![false, true, false, true, true, false]);
|
||||
assert_eq!(bv.count_ones(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_from_counts_with_threshold() {
|
||||
// threshold=5: present if count >= 5
|
||||
let counts = vec![0u32, 3, 5, 10, 4, 6];
|
||||
let (_dc, rc) = make_pciv(&counts);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("bv.pbiv");
|
||||
PersistentBitVecBuilder::build_from_counts(&rc, 5, &path).unwrap().close().unwrap();
|
||||
let bv = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(bv.iter().collect::<Vec<_>>(), vec![false, false, true, true, false, true]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_from_counts_with_overflow_values() {
|
||||
// overflow value (>= 255) must pass threshold
|
||||
let counts = vec![0u32, 100, 1_000_000, 50];
|
||||
let (_dc, rc) = make_pciv(&counts);
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("bv.pbiv");
|
||||
PersistentBitVecBuilder::build_from_counts(&rc, 200, &path).unwrap().close().unwrap();
|
||||
let bv = PersistentBitVec::open(&path).unwrap();
|
||||
assert_eq!(bv.iter().collect::<Vec<_>>(), vec![false, false, true, false]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hamming_dist_basic() {
|
||||
// differ at positions 1 and 2
|
||||
let (_da, ra) = make_bv(&[true, true, false, true]);
|
||||
let (_db, rb) = make_bv(&[true, false, true, true]);
|
||||
assert_eq!(ra.hamming_dist(&rb), 2);
|
||||
}
|
||||
@@ -1,8 +1,10 @@
|
||||
mod bitvec;
|
||||
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||
|
||||
fn roundtrip(values: &[(u64, u32)], n: usize) -> Vec<u32> {
|
||||
fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec<u32> {
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("test.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::new(n, &path).unwrap();
|
||||
@@ -11,7 +13,7 @@ fn roundtrip(values: &[(u64, u32)], n: usize) -> Vec<u32> {
|
||||
}
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
(0..n as u64).map(|s| r.get(s)).collect()
|
||||
(0..n).map(|s| r.get(s)).collect()
|
||||
}
|
||||
|
||||
fn make_pciv(values: &[u32]) -> (tempfile::TempDir, PersistentCompactIntVec) {
|
||||
@@ -19,7 +21,7 @@ fn make_pciv(values: &[u32]) -> (tempfile::TempDir, PersistentCompactIntVec) {
|
||||
let path = dir.path().join("v.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::new(values.len(), &path).unwrap();
|
||||
for (i, &v) in values.iter().enumerate() {
|
||||
b.set(i as u64, v);
|
||||
b.set(i, v);
|
||||
}
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
@@ -91,19 +93,19 @@ fn sparse_index_built_for_many_overflows() {
|
||||
let path = dir.path().join("test.pciv");
|
||||
let mut b = PersistentCompactIntVecBuilder::new(n, &path).unwrap();
|
||||
for i in 0..n {
|
||||
b.set(i as u64, 1000 + i as u32);
|
||||
b.set(i, 1000 + i as u32);
|
||||
}
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
assert!(r.step > 0, "sparse index should have been built");
|
||||
for i in 0..n {
|
||||
assert_eq!(r.get(i as u64), 1000 + i as u32, "mismatch at slot {i}");
|
||||
assert_eq!(r.get(i), 1000 + i as u32, "mismatch at slot {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_matches_get() {
|
||||
let values = [(0u64, 255u32), (1, 1000), (2, 50), (3, 1_313_691), (4, 7), (5, 42)];
|
||||
let values = [(0usize, 255u32), (1, 1000), (2, 50), (3, 1_313_691), (4, 7), (5, 42)];
|
||||
let n = 6;
|
||||
let dir = tempdir().unwrap();
|
||||
let path = dir.path().join("test.pciv");
|
||||
@@ -115,7 +117,7 @@ fn iter_matches_get() {
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
|
||||
let via_iter: Vec<u32> = r.iter().collect();
|
||||
let via_get: Vec<u32> = (0..n as u64).map(|s| r.get(s)).collect();
|
||||
let via_get: Vec<u32> = (0..n).map(|s| r.get(s)).collect();
|
||||
assert_eq!(via_iter, via_get);
|
||||
}
|
||||
|
||||
@@ -222,6 +224,106 @@ fn combine_diff() {
|
||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bray_dist_basic() {
|
||||
// min=[1,0,0], sum_a=2, sum_b=2 → 1 - 2*1/4 = 0.5
|
||||
let (_da, ra) = make_pciv(&[1, 0, 1]);
|
||||
let (_db, rb) = make_pciv(&[1, 1, 0]);
|
||||
let d = ra.bray_dist(&rb);
|
||||
assert!((d - 0.5).abs() < 1e-12, "got {d}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bray_dist_identical() {
|
||||
let (_d, r) = make_pciv(&[3, 7, 2]);
|
||||
assert_eq!(r.bray_dist(&r), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bray_dist_disjoint() {
|
||||
// no overlap → min=0 → BC=1
|
||||
let (_da, ra) = make_pciv(&[1, 0]);
|
||||
let (_db, rb) = make_pciv(&[0, 1]);
|
||||
assert_eq!(ra.bray_dist(&rb), 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn relfreq_bray_dist_basic() {
|
||||
// [2,0] vs [0,2]: rel_freqs [1,0] vs [0,1], min=0 → dist=1
|
||||
let (_da, ra) = make_pciv(&[2, 0]);
|
||||
let (_db, rb) = make_pciv(&[0, 2]);
|
||||
assert_eq!(ra.relfreq_bray_dist(&rb), 1.0);
|
||||
// [2,0] vs [1,0]: same direction → dist=0
|
||||
let (_dc, rc) = make_pciv(&[1, 0]);
|
||||
assert_eq!(ra.relfreq_bray_dist(&rc), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn euclidean_dist_basic() {
|
||||
// [3,4] vs [0,0] → 5
|
||||
let (_da, ra) = make_pciv(&[3, 4]);
|
||||
let (_db, rb) = make_pciv(&[0, 0]);
|
||||
assert!((ra.euclidean_dist(&rb) - 5.0).abs() < 1e-12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn euclidean_dist_identical() {
|
||||
let (_d, r) = make_pciv(&[3, 7, 2]);
|
||||
assert_eq!(r.euclidean_dist(&r), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hellinger_euclidean_dist_disjoint() {
|
||||
// [4,0] vs [0,4]: sqrt(relfreq) = [1,0] vs [0,1] → euclidean = sqrt(2)
|
||||
let (_da, ra) = make_pciv(&[4, 0]);
|
||||
let (_db, rb) = make_pciv(&[0, 4]);
|
||||
assert!((ra.hellinger_euclidean_dist(&rb) - 2f64.sqrt()).abs() < 1e-12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hellinger_euclidean_dist_identical() {
|
||||
let (_d, r) = make_pciv(&[3, 7, 2]);
|
||||
assert_eq!(r.hellinger_euclidean_dist(&r), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jaccard_dist_basic() {
|
||||
// ref={0,2,3} other={0,1,2} → inter=2 union=4 → dist=0.5
|
||||
let (_da, ra) = make_pciv(&[1, 0, 1, 1]);
|
||||
let (_db, rb) = make_pciv(&[1, 1, 1, 0]);
|
||||
assert!((ra.jaccard_dist(&rb) - 0.5).abs() < 1e-12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn jaccard_dist_identical() {
|
||||
let (_d, r) = make_pciv(&[3, 0, 7]);
|
||||
assert_eq!(r.jaccard_dist(&r), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hellinger_dist_disjoint() {
|
||||
// disjoint → hellinger_euclidean = sqrt(2) → hellinger = 1.0
|
||||
let (_da, ra) = make_pciv(&[4, 0]);
|
||||
let (_db, rb) = make_pciv(&[0, 4]);
|
||||
assert!((ra.hellinger_dist(&rb) - 1.0).abs() < 1e-12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn hellinger_dist_identical() {
|
||||
let (_d, r) = make_pciv(&[3, 7, 2]);
|
||||
assert_eq!(r.hellinger_dist(&r), 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn threshold_jaccard_dist_with_threshold() {
|
||||
// threshold=3: ref={0,2} (values 5,4≥3) other={1,2} (values 3,4≥3)
|
||||
// inter={2}=1, union={0,1,2}=3 → dist=2/3
|
||||
let (_da, ra) = make_pciv(&[5, 1, 4]);
|
||||
let (_db, rb) = make_pciv(&[2, 3, 4]);
|
||||
let d = ra.threshold_jaccard_dist(&rb, 3);
|
||||
assert!((d - 2.0 / 3.0).abs() < 1e-12, "got {d}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn mixed_large_dataset() {
|
||||
let n = 1000usize;
|
||||
@@ -230,12 +332,12 @@ fn mixed_large_dataset() {
|
||||
let mut b = PersistentCompactIntVecBuilder::new(n, &path).unwrap();
|
||||
for i in 0..n {
|
||||
let v = if i % 100 == 0 { 100_000 + i as u32 } else { i as u32 % 200 };
|
||||
b.set(i as u64, v);
|
||||
b.set(i, v);
|
||||
}
|
||||
b.close().unwrap();
|
||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||
for i in 0..n {
|
||||
let expected = if i % 100 == 0 { 100_000 + i as u32 } else { i as u32 % 200 };
|
||||
assert_eq!(r.get(i as u64), expected, "slot {i}");
|
||||
assert_eq!(r.get(i), expected, "slot {i}");
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user