f91c5a3f79
Refactors column and matrix access to use unified `BitSliceView` and `IntSliceView` abstractions, replacing legacy `PackedCol`/`IntColView` types. Introduces `BitSlice`/`IntSlice` traits for zero-copy, trait-based bitwise and arithmetic operations across persistent and temporary vector types. Removes deprecated in-memory `MemoryBitVec` and `MemoryIntVec` implementations and their tests, while updating dependent crates to use the new view-based API and `BitSliceMut` trait.
270 lines
10 KiB
Rust
270 lines
10 KiB
Rust
use std::collections::HashMap;
|
|
use std::fs::{self, OpenOptions};
|
|
use std::io;
|
|
use std::path::{Path, PathBuf};
|
|
|
|
use memmap2::MmapMut;
|
|
|
|
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
|
|
use crate::reader::PersistentCompactIntVec;
|
|
use crate::views::{BitSliceView, IntSliceView};
|
|
|
|
pub struct PersistentCompactIntVecBuilder {
|
|
path: PathBuf,
|
|
mmap: MmapMut,
|
|
n: usize,
|
|
overflow: HashMap<usize, u32>,
|
|
}
|
|
|
|
impl PersistentCompactIntVecBuilder {
|
|
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
|
let file = OpenOptions::new()
|
|
.read(true).write(true).create(true).truncate(true)
|
|
.open(path)?;
|
|
file.set_len((HEADER_SIZE + n) as u64)?;
|
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
|
|
}
|
|
|
|
pub fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
|
|
let n = primary.len();
|
|
let file = OpenOptions::new()
|
|
.read(true).write(true).create(true).truncate(true)
|
|
.open(path)?;
|
|
file.set_len((HEADER_SIZE + n) as u64)?;
|
|
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
|
mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(primary);
|
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
|
}
|
|
|
|
pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
|
fs::copy(source.path(), path)?;
|
|
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
|
let n = source.len();
|
|
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
|
let data_offset = HEADER_SIZE + n;
|
|
let mut overflow = HashMap::with_capacity(n_overflow);
|
|
for i in 0..n_overflow {
|
|
let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
|
|
overflow.insert(slot, value);
|
|
}
|
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
|
}
|
|
|
|
pub fn get(&self, slot: usize) -> u32 {
|
|
match self.mmap[HEADER_SIZE + slot] {
|
|
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
|
v => v as u32,
|
|
}
|
|
}
|
|
|
|
pub fn set(&mut self, slot: usize, value: u32) {
|
|
if value < 255 {
|
|
self.mmap[HEADER_SIZE + slot] = value as u8;
|
|
self.overflow.remove(&slot);
|
|
} else {
|
|
self.mmap[HEADER_SIZE + slot] = 255;
|
|
self.overflow.insert(slot, value);
|
|
}
|
|
}
|
|
|
|
/// Number of slots in the builder.
pub fn len(&self) -> usize {
    self.n
}
|
|
/// `true` when the builder has zero slots.
pub fn is_empty(&self) -> bool {
    matches!(self.n, 0)
}
|
|
|
|
/// Raw per-slot bytes: the `n` bytes of the mapping that follow the header.
pub fn primary_bytes(&self) -> &[u8] {
    let start = HEADER_SIZE;
    let end = HEADER_SIZE + self.n;
    &self.mmap[start..end]
}
|
|
/// Mutable access to the raw per-slot bytes that follow the header.
///
/// Writing through this slice does not touch the overflow map.
pub fn primary_bytes_mut(&mut self) -> &mut [u8] {
    let start = HEADER_SIZE;
    let end = HEADER_SIZE + self.n;
    &mut self.mmap[start..end]
}
|
|
/// Drop every overflow entry; the primary bytes are left untouched.
pub fn clear_overflow(&mut self) {
    self.overflow.clear();
}
|
|
|
|
pub fn sum(&self) -> u64 {
|
|
byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
|
|
}
|
|
pub fn count_nonzero(&self) -> u64 {
|
|
byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
|
|
}
|
|
|
|
pub fn view(&self) -> IntSliceView<'_> {
|
|
// Builder overflow is a HashMap, not sorted raw bytes — convert on the fly
|
|
// by collecting into a sorted vec and storing in a thread-local buffer.
|
|
// For read-back during building, just call get(slot) directly.
|
|
// view() is primarily useful AFTER freeze (on PersistentCompactIntVec).
|
|
// Here we expose it via a zero-alloc path: primary only, no overflow raw.
|
|
// Callers that need overflow_entries during building use overflow_entries().
|
|
let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n];
|
|
IntSliceView::new(primary, &[], 0, self.n)
|
|
}
|
|
|
|
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
|
self.overflow.iter().map(|(&k, &v)| (k, v))
|
|
}
|
|
|
|
pub fn inc(&mut self, slot: usize) {
|
|
let v = self.get(slot);
|
|
self.set(slot, v.saturating_add(1));
|
|
}
|
|
|
|
// ── Computation methods ───────────────────────────────────────────────────
|
|
|
|
/// Increment one counter per 1-bit of `col`. Safe for any group size.
|
|
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
|
|
let n = self.n;
|
|
for (wi, &word) in col.words().iter().enumerate() {
|
|
if word == 0 { continue; }
|
|
let mut w = word;
|
|
while w != 0 {
|
|
let bit = w.trailing_zeros() as usize;
|
|
let slot = wi * 64 + bit;
|
|
if slot < n { self.inc(slot); }
|
|
w &= w - 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Increment one counter per 1-bit of `col`, using raw u8 arithmetic.
|
|
/// Caller guarantees no counter will reach 255 (group size < 255).
|
|
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
|
{
|
|
let primary = self.primary_bytes_mut();
|
|
let n = primary.len();
|
|
for (wi, &word) in col.words().iter().enumerate() {
|
|
if word == 0 { continue; }
|
|
let mut w = word;
|
|
while w != 0 {
|
|
let bit = w.trailing_zeros() as usize;
|
|
let s = wi * 64 + bit;
|
|
if s < n { primary[s] += 1; }
|
|
w &= w - 1;
|
|
}
|
|
}
|
|
}
|
|
debug_assert!(
|
|
!self.primary_bytes().contains(&255),
|
|
"sentinel 255 reached in inc_present_fast — group size must be < 255"
|
|
);
|
|
}
|
|
|
|
/// Two-pass: primary bytes then overflow. Increments `self[slot]` for each
|
|
/// slot where `pred(col[slot])` is true. Safe for any group size.
|
|
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
|
let n = col.len();
|
|
for slot in 0..n {
|
|
let b = col.primary_bytes()[slot];
|
|
if b < 255 && pred(b as u32) {
|
|
self.inc(slot);
|
|
}
|
|
}
|
|
for (slot, val) in col.overflow_entries() {
|
|
if pred(val) { self.inc(slot); }
|
|
}
|
|
}
|
|
|
|
/// Fast two-pass: raw u8 arithmetic. Caller guarantees no counter reaches 255.
|
|
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
|
let n = col.len();
|
|
{
|
|
let primary = self.primary_bytes_mut();
|
|
for slot in 0..n {
|
|
let b = col.primary_bytes()[slot];
|
|
if b < 255 && pred(b as u32) {
|
|
primary[slot] += 1;
|
|
}
|
|
}
|
|
}
|
|
for (slot, val) in col.overflow_entries() {
|
|
if pred(val) { self.primary_bytes_mut()[slot] += 1; }
|
|
}
|
|
debug_assert!(
|
|
!self.primary_bytes().contains(&255),
|
|
"sentinel 255 reached in inc_predicate_fast — group size must be < 255"
|
|
);
|
|
}
|
|
|
|
/// Element-wise addition: `self[s] += other[s]` for every slot of `self`.
///
/// When both operands are stored inline (byte < 255) the sum is computed
/// from the raw bytes; otherwise the full values are fetched with `get`.
/// Sums saturate at `u32::MAX`, matching the saturating behaviour of `inc`
/// and `diff`, instead of panicking (debug) or wrapping (release).
///
/// # Panics
/// Panics if `other` has fewer primary bytes than `self` has slots.
pub fn add(&mut self, other: IntSliceView<'_>) {
    for s in 0..self.n {
        let sb = self.primary_bytes()[s];
        let ob = other.primary_bytes()[s];
        if sb < 255 && ob < 255 {
            // Two bytes below 255 sum to at most 508, so this cannot overflow.
            let sum = u32::from(sb) + u32::from(ob);
            if sum < 255 {
                // Still fits inline; no overflow entry exists for this slot.
                self.primary_bytes_mut()[s] = sum as u8;
            } else {
                self.set(s, sum);
            }
        } else {
            let sv = self.get(s);
            let ov = other.get(s);
            self.set(s, sv.saturating_add(ov));
        }
    }
}
|
|
|
|
pub fn min(&mut self, other: IntSliceView<'_>) {
|
|
let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
|
|
let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
|
|
self.clear_overflow();
|
|
for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
|
|
if b < *a { *a = b; }
|
|
}
|
|
for (slot, self_val) in self_ov {
|
|
if let Some(&other_val) = other_ov.get(&slot) {
|
|
self.set(slot, self_val.min(other_val));
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Element-wise maximum: `self[s] = max(self[s], other[s])`.
///
/// Slots that overflow in `other` are merged first through `get`/`set`;
/// afterwards the raw bytes are raised wherever `other`'s byte is larger.
pub fn max(&mut self, other: IntSliceView<'_>) {
    for (slot, their_value) in other.overflow_entries() {
        let merged = self.get(slot).max(their_value);
        self.set(slot, merged);
    }

    let pairs = self.primary_bytes_mut().iter_mut().zip(other.primary_bytes());
    for (mine, &theirs) in pairs {
        if *mine < theirs {
            *mine = theirs;
        }
    }
}
|
|
|
|
/// Element-wise saturating subtraction: `self[s] = self[s] - other[s]`,
/// clamped at zero.
///
/// # Panics
/// Panics if `other` has fewer primary bytes than `self` has slots.
pub fn diff(&mut self, other: IntSliceView<'_>) {
    for s in 0..self.n {
        let sb = self.primary_bytes()[s];
        let ob = other.primary_bytes()[s];
        if sb == 255 {
            // Own value lives in the overflow map; work on full u32 values.
            let own = self.get(s);
            let theirs = if ob == 255 { other.get(s) } else { u32::from(ob) };
            self.set(s, own.saturating_sub(theirs));
        } else {
            // Own value is inline. An overflowed `other` is at least 255 and
            // therefore larger, so the result clamps to zero.
            let reduced = match ob {
                255 => 0,
                small => sb.saturating_sub(small),
            };
            self.primary_bytes_mut()[s] = reduced;
        }
    }
}
|
|
|
|
/// Zero every slot whose bit in `mask` is 0.
///
/// Only the words present in `mask` are examined, and bit positions at or
/// beyond `len()` are ignored. Slots whose byte is already 0 are left
/// untouched; the rest are cleared through `set`, which also removes any
/// overflow entry.
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
    let limit = self.n;
    for (word_idx, &word) in mask.words().iter().enumerate() {
        // Inverting turns the 0-bits into set bits; an all-ones word yields
        // zero and the inner loop does not run.
        let mut cleared = !word;
        while cleared != 0 {
            let slot = word_idx * 64 + cleared.trailing_zeros() as usize;
            if slot < limit && self.primary_bytes()[slot] != 0 {
                self.set(slot, 0);
            }
            cleared &= cleared - 1; // drop the lowest set bit
        }
    }
}
|
|
|
|
pub fn close(self) -> io::Result<()> {
|
|
self.mmap.flush()?;
|
|
let Self { path, mmap, n, overflow } = self;
|
|
drop(mmap);
|
|
let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
|
|
entries.sort_unstable_by_key(|&(slot, _)| slot);
|
|
finalize_pciv(&path, n, &entries)
|
|
}
|
|
|
|
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
|
|
let path = self.path.clone();
|
|
self.close()?;
|
|
PersistentCompactIntVec::open(&path)
|
|
}
|
|
}
|