diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index 8039e29..2717174 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -7,13 +7,12 @@ use ndarray::{Array1, Array2}; use rayon::prelude::*; use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; -use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits}; -use crate::memoryvec::MemoryBitVec; -use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; -use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; -use crate::traits::{BitSlice, BitSliceMut, IntSliceMut}; +use crate::colgroup::{ColGroup, MatrixGroupOps}; use crate::layer_meta::LayerMeta; use crate::meta::MatrixMeta; +use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; +use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; +use crate::views::BitSliceView; fn col_path(dir: &Path, col: usize) -> PathBuf { dir.join(format!("col_{col:06}.pbiv")) @@ -143,18 +142,14 @@ impl PackedBitMatrix { unsafe { std::slice::from_raw_parts(ptr, nw) } } - pub(crate) fn col_slice(&self, c: usize) -> PackedCol<'_> { - PackedCol { words: self.col_words(c), n: self.n_rows } + pub(crate) fn col_slice(&self, c: usize) -> BitSliceView<'_> { + BitSliceView::new(self.col_words(c), self.n_rows) } pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path) } - pub(crate) fn col_as_memory(&self, c: usize) -> MemoryBitVec { - MemoryBitVec::from(&self.col_slice(c)) - } - pub(crate) fn count_ones(&self) -> Array1 { Array1::from_vec( (0..self.n_cols).into_par_iter() @@ -165,47 +160,17 @@ impl PackedBitMatrix { pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { pairwise2_matrix(self.n_cols, |i, j| { - self.col_slice(i).partial_jaccard_dist(&self.col_slice(j)) + self.col_slice(i).partial_jaccard_dist(self.col_slice(j)) }) } pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2 { pairwise_matrix(self.n_cols, |i, j| { - self.col_slice(i).hamming_dist(&self.col_slice(j)) + self.col_slice(i).hamming_dist(self.col_slice(j)) }) } } -pub(crate) struct PackedCol<'a> { - words: &'a [u64], - n: usize, -} - -impl BitSlice for PackedCol<'_> { - fn len(&self) -> usize { self.n } - fn words(&self) -> &[u64] { self.words } -} - -// ── BitColView — uniform column access across Columnar and Packed ───────────── - -enum BitColViewInner<'a> { - Columnar(&'a PersistentBitVec), - Packed(PackedCol<'a>), -} - -/// Opaque column view returned by [`PersistentBitMatrix::col_view`]. -/// Implements [`BitSlice`] uniformly for both Columnar and Packed matrix formats. -pub struct BitColView<'a>(BitColViewInner<'a>); - -impl BitSlice for BitColView<'_> { - fn len(&self) -> usize { - match &self.0 { BitColViewInner::Columnar(c) => c.len(), BitColViewInner::Packed(c) => c.len() } - } - fn words(&self) -> &[u64] { - match &self.0 { BitColViewInner::Columnar(c) => c.words(), BitColViewInner::Packed(c) => c.words() } - } -} - /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files. pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> { let packed_path = dir.join("matrix.pbmx"); @@ -321,10 +286,10 @@ impl PersistentBitMatrix { } } - pub fn col_view(&self, c: usize) -> BitColView<'_> { + pub fn col_view(&self, c: usize) -> BitSliceView<'_> { match self { - Self::Columnar(m) => BitColView(BitColViewInner::Columnar(m.col(c))), - Self::Packed(m) => BitColView(BitColViewInner::Packed(m.col_slice(c))), + Self::Columnar(m) => m.col(c).view(), + Self::Packed(m) => m.col_slice(c), Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"), } } @@ -341,14 +306,6 @@ impl PersistentBitMatrix { } } - pub fn col_as_memory(&self, c: usize) -> MemoryBitVec { - match self { - Self::Columnar(m) => MemoryBitVec::from(m.col(c)), - Self::Packed(m) => m.col_as_memory(c), - Self::Implicit { n_rows, .. } => MemoryBitVec::ones(*n_rows), - } - } - pub fn row(&self, slot: usize) -> Box<[bool]> { match self { Self::Columnar(m) => m.row(slot), @@ -458,27 +415,19 @@ impl MatrixGroupOps for PersistentBitMatrix { let n = self.n(); if g.indices.len() < 255 { let mut builder = TempCompactIntVecBuilder::new(n)?; - { - let primary = builder.primary_bytes_mut(); - for &c in &g.indices { - let mbv = MemoryBitVec::from(&self.col_view(c)); - inc_primary_bits(primary, &mbv); - } + for &c in &g.indices { + builder.inc_present_fast(self.col_view(c)); } builder.freeze() } else { let mut result = TempCompactIntVecBuilder::new(n)?; for chunk in g.indices.chunks(254) { - let mut chunk_builder = TempCompactIntVecBuilder::new(n)?; - { - let primary = chunk_builder.primary_bytes_mut(); - for &c in chunk { - let mbv = MemoryBitVec::from(&self.col_view(c)); - inc_primary_bits(primary, &mbv); - } + let mut chunk_b = TempCompactIntVecBuilder::new(n)?; + for &c in chunk { + chunk_b.inc_present_fast(self.col_view(c)); } - let chunk_frozen = chunk_builder.freeze()?; - IntSliceMut::add(&mut result, &chunk_frozen); + let frozen = chunk_b.freeze()?; + result.add(frozen.view()); } result.freeze() } @@ -493,7 +442,7 @@ impl MatrixGroupOps for PersistentBitMatrix { let n = self.n(); let mut result = TempBitVecBuilder::new(n)?; for &c in &g.indices { - result.or(&self.col_view(c)); + result.or(self.col_view(c)); } result.freeze() } diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs index 1d91b10..8cde36b 100644 --- a/src/obicompactvec/src/bitvec.rs +++ b/src/obicompactvec/src/bitvec.rs @@ -5,29 +5,25 @@ use std::path::{Path, PathBuf}; use memmap2::{Mmap, MmapMut}; use crate::reader::PersistentCompactIntVec; +use crate::views::{BitSliceView, BitSliceIter}; const MAGIC: [u8; 4] = *b"PBIV"; // Header: magic(4) + _pad(4) + n(8) = 16 bytes. -// Data starts at offset 16, which is divisible by 8 → u64-aligned -// (mmap base is page-aligned, 16 % 8 == 0). +// Data starts at offset 16, u64-aligned (mmap base is page-aligned, 16 % 8 == 0). const HEADER_SIZE: usize = 16; #[inline] -pub(crate) fn n_words(n: usize) -> usize { - n.div_ceil(64) -} +pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) } #[inline] -fn n_bytes_for_words(n: usize) -> usize { - n_words(n) * 8 -} +fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 } -// ── Reader ──────────────────────────────────────────────────────────────────── +// ── PersistentBitVec ────────────────────────────────────────────────────────── pub struct PersistentBitVec { mmap: Mmap, - n: usize, + n: usize, path: PathBuf, } @@ -35,44 +31,49 @@ impl PersistentBitVec { pub fn open(path: &Path) -> io::Result { let mmap = unsafe { Mmap::map(&File::open(path)?)? }; if mmap.len() < HEADER_SIZE { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "PBIV file too short", - )); + return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short")); } if &mmap[0..4] != &MAGIC { return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic")); } let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; - Ok(Self { - mmap, - n, - path: path.to_path_buf(), - }) + Ok(Self { mmap, n, path: path.to_path_buf() }) } - pub fn path(&self) -> &Path { - &self.path - } - pub fn len(&self) -> usize { - self.n - } - pub fn is_empty(&self) -> bool { - self.n == 0 - } + pub fn path(&self) -> &Path { &self.path } + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } pub fn get(&self, slot: usize) -> bool { (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 } - // SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8, - // so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes. + // SAFETY: mmap is page-aligned, HEADER_SIZE=16 divisible by 8 → u64-aligned. fn data_words(&self) -> &[u64] { - let nw = n_words(self.n); + let nw = n_words(self.n); let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; unsafe { std::slice::from_raw_parts(ptr, nw) } } + pub fn view(&self) -> BitSliceView<'_> { + BitSliceView::new(self.data_words(), self.n) + } + + pub fn words(&self) -> &[u64] { self.data_words() } + + pub fn count_ones(&self) -> u64 { self.view().count_ones() } + pub fn count_zeros(&self) -> u64 { self.view().count_zeros() } + + pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) { + self.view().partial_jaccard_dist(other.view()) + } + pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 { + self.view().jaccard_dist(other.view()) + } + pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 { + self.view().hamming_dist(other.view()) + } + pub fn iter(&self) -> BitIter<'_> { BitIter { words: self.data_words(), slot: 0, n: self.n } } @@ -81,40 +82,38 @@ impl PersistentBitVec { impl<'a> IntoIterator for &'a PersistentBitVec { type Item = bool; type IntoIter = BitIter<'a>; - fn into_iter(self) -> BitIter<'a> { - self.iter() - } + fn into_iter(self) -> BitIter<'a> { self.iter() } } +// ── BitIter ─────────────────────────────────────────────────────────────────── + pub struct BitIter<'a> { pub(crate) words: &'a [u64], - pub(crate) slot: usize, - pub(crate) n: usize, + pub(crate) slot: usize, + pub(crate) n: usize, } impl ExactSizeIterator for BitIter<'_> {} impl Iterator for BitIter<'_> { type Item = bool; - fn next(&mut self) -> Option { if self.slot >= self.n { return None; } let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0; self.slot += 1; Some(v) } - fn size_hint(&self) -> (usize, Option) { let rem = self.n - self.slot; (rem, Some(rem)) } } -// ── Builder ─────────────────────────────────────────────────────────────────── +// ── PersistentBitVecBuilder ─────────────────────────────────────────────────── pub struct PersistentBitVecBuilder { mmap: MmapMut, - n: usize, + n: usize, path: PathBuf, } @@ -122,13 +121,10 @@ impl PersistentBitVecBuilder { pub fn new(n: usize, path: &Path) -> io::Result { let file_size = HEADER_SIZE + n_bytes_for_words(n); let mut file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) + .read(true).write(true).create(true).truncate(true) .open(path)?; file.write_all(&MAGIC)?; - file.write_all(&[0u8; 4])?; // padding + file.write_all(&[0u8; 4])?; file.write_all(&(n as u64).to_le_bytes())?; file.seek(SeekFrom::Start(0))?; file.set_len(file_size as u64)?; @@ -136,8 +132,6 @@ impl PersistentBitVecBuilder { Ok(Self { mmap, n, path: path.to_path_buf() }) } - /// Create a PBIV file from raw packed bit-bytes, zero-padding to the next word boundary. - /// `bytes` is `n.div_ceil(8)` bytes; `n` is the number of bits. pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result { let file_size = HEADER_SIZE + n_bytes_for_words(n); let file = OpenOptions::new() @@ -159,44 +153,11 @@ impl PersistentBitVecBuilder { Ok(Self { mmap, n, path: path.to_path_buf() }) } - pub fn len(&self) -> usize { - self.n - } - pub fn is_empty(&self) -> bool { - self.n == 0 - } - - pub fn get(&self, slot: usize) -> bool { - (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 - } - - fn data_words(&self) -> &[u64] { - let nw = n_words(self.n); - let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; - unsafe { std::slice::from_raw_parts(ptr, nw) } - } - - // SAFETY: same alignment argument as PersistentBitVec::data_words. - fn data_words_mut(&mut self) -> &mut [u64] { - let nw = n_words(self.n); - let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; - unsafe { std::slice::from_raw_parts_mut(ptr, nw) } - } - - /// Convert a count vector to a bit vector: bit set iff count >= threshold. - /// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead. - pub fn build_from_counts( - source: &PersistentCompactIntVec, - threshold: u32, - path: &Path, - ) -> io::Result { + pub fn build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result { let n = source.len(); let file_size = HEADER_SIZE + n_bytes_for_words(n); let mut file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) + .read(true).write(true).create(true).truncate(true) .open(path)?; file.write_all(&MAGIC)?; file.write_all(&[0u8; 4])?; @@ -204,52 +165,91 @@ impl PersistentBitVecBuilder { file.seek(SeekFrom::Start(0))?; file.set_len(file_size as u64)?; let mut mmap = unsafe { MmapMut::map_mut(&file)? }; - { - let nw = n_words(n); + let nw = n_words(n); let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) }; for (slot, count) in source.iter().enumerate() { - if count >= threshold { - words[slot >> 6] |= 1u64 << (slot & 63); - } + if count >= threshold { words[slot >> 6] |= 1u64 << (slot & 63); } } } - Ok(Self { mmap, n, path: path.to_path_buf() }) } - /// Convert a count vector to a presence/absence bit vector (threshold = 1). pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result { Self::build_from_counts(source, 1, path) } - pub fn close(self) -> io::Result<()> { - self.mmap.flush() + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + pub fn get(&self, slot: usize) -> bool { + (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 } - /// Flush, close, and reopen as a read-only `PersistentBitVec`. + pub fn set(&mut self, slot: usize, value: bool) { + let bit = 1u64 << (slot & 63); + if value { self.data_words_mut()[slot >> 6] |= bit; } + else { self.data_words_mut()[slot >> 6] &= !bit; } + } + + fn data_words(&self) -> &[u64] { + let nw = n_words(self.n); + let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; + unsafe { std::slice::from_raw_parts(ptr, nw) } + } + + // SAFETY: same alignment argument as PersistentBitVec::data_words. + fn data_words_mut(&mut self) -> &mut [u64] { + let nw = n_words(self.n); + let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; + unsafe { std::slice::from_raw_parts_mut(ptr, nw) } + } + + pub fn view(&self) -> BitSliceView<'_> { + BitSliceView::new(self.data_words(), self.n) + } + + pub fn words(&self) -> &[u64] { self.data_words() } + + pub fn copy_from(&mut self, src: BitSliceView<'_>) { + assert_eq!(self.n, src.len(), "BitSliceView length mismatch"); + self.data_words_mut().copy_from_slice(src.words()); + } + + pub fn and(&mut self, other: BitSliceView<'_>) { + assert_eq!(self.n, other.len(), "BitSliceView length mismatch"); + for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w &= o; } + } + + pub fn or(&mut self, other: BitSliceView<'_>) { + assert_eq!(self.n, other.len(), "BitSliceView length mismatch"); + for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w |= o; } + } + + pub fn xor(&mut self, other: BitSliceView<'_>) { + assert_eq!(self.n, other.len(), "BitSliceView length mismatch"); + for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w ^= o; } + } + + pub fn not(&mut self) { + let rem = self.n % 64; + let words = self.data_words_mut(); + for w in words.iter_mut() { *w ^= u64::MAX; } + if rem != 0 { + if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; } + } + } + + pub fn iter(&self) -> BitSliceIter<'_> { + self.view().iter() + } + + pub fn close(self) -> io::Result<()> { self.mmap.flush() } + pub fn finish(self) -> io::Result { let path = self.path.clone(); self.close()?; PersistentBitVec::open(&path) } } - -// ── BitSlice / BitSliceMut impls ────────────────────────────────────────────── - -use crate::traits::{BitSlice, BitSliceMut}; - -impl BitSlice for PersistentBitVec { - fn len(&self) -> usize { self.n } - fn words(&self) -> &[u64] { self.data_words() } -} - -impl BitSlice for PersistentBitVecBuilder { - fn len(&self) -> usize { self.n } - fn words(&self) -> &[u64] { self.data_words() } -} - -impl BitSliceMut for PersistentBitVecBuilder { - fn words_mut(&mut self) -> &mut [u64] { self.data_words_mut() } -} diff --git a/src/obicompactvec/src/builder.rs b/src/obicompactvec/src/builder.rs index 271b5d8..266b3c1 100644 --- a/src/obicompactvec/src/builder.rs +++ b/src/obicompactvec/src/builder.rs @@ -7,53 +7,26 @@ use memmap2::MmapMut; use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry}; use crate::reader::PersistentCompactIntVec; +use crate::views::{BitSliceView, IntSliceView}; pub struct PersistentCompactIntVecBuilder { - path: PathBuf, - mmap: MmapMut, - n: usize, + path: PathBuf, + mmap: MmapMut, + n: usize, overflow: HashMap, } impl PersistentCompactIntVecBuilder { - /// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately. pub fn new(n: usize, path: &Path) -> io::Result { - let file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) - .open(path)?; - file.set_len((HEADER_SIZE + n) as u64)?; - let mmap = unsafe { MmapMut::map_mut(&file)? }; - Ok(Self { - path: path.to_path_buf(), - mmap, - n, - overflow: HashMap::new(), - }) - } - - /// Create from a [`MemoryIntVec`], copying primary bytes directly into the mmap. - /// O(n) memcpy + O(n_overflow) HashMap clone — no per-slot `set` overhead. - pub fn from_memory(src: &crate::memoryintvec::MemoryIntVec, path: &Path) -> io::Result { - let n = src.len(); let file = OpenOptions::new() .read(true).write(true).create(true).truncate(true) .open(path)?; file.set_len((HEADER_SIZE + n) as u64)?; - let mut mmap = unsafe { MmapMut::map_mut(&file)? }; - mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(src.primary_bytes()); - Ok(Self { - path: path.to_path_buf(), - mmap, - n, - overflow: src.overflow_map().clone(), - }) + let mmap = unsafe { MmapMut::map_mut(&file)? }; + Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() }) } - /// Create from raw primary bytes + an already-built overflow map (no per-slot overhead). - pub(crate) fn from_raw_primary(primary: &[u8], overflow: HashMap, path: &Path) -> io::Result { + pub fn from_raw_primary(primary: &[u8], overflow: HashMap, path: &Path) -> io::Result { let n = primary.len(); let file = OpenOptions::new() .read(true).write(true).create(true).truncate(true) @@ -64,40 +37,25 @@ impl PersistentCompactIntVecBuilder { Ok(Self { path: path.to_path_buf(), mmap, n, overflow }) } - /// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM. - /// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow). pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result { fs::copy(source.path(), path)?; - let file = OpenOptions::new().read(true).write(true).open(path)?; let mmap = unsafe { MmapMut::map_mut(&file)? }; - - let n = source.len(); + let n = source.len(); let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize; let data_offset = HEADER_SIZE + n; - let mut overflow = HashMap::with_capacity(n_overflow); for i in 0..n_overflow { let (slot, value) = parse_overflow_entry(&mmap, data_offset, i); overflow.insert(slot, value); } - - Ok(Self { - path: path.to_path_buf(), - mmap, - n, - overflow, - }) + Ok(Self { path: path.to_path_buf(), mmap, n, overflow }) } - /// Get the value at the given slot, handling overflow if necessary. pub fn get(&self, slot: usize) -> u32 { match self.mmap[HEADER_SIZE + slot] { - 255 => *self - .overflow - .get(&slot) - .expect("sentinel without overflow entry"), - v => v as u32, + 255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"), + v => v as u32, } } @@ -111,15 +69,189 @@ impl PersistentCompactIntVecBuilder { } } - pub fn len(&self) -> usize { - self.n + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + pub fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] } + pub fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] } + pub fn clear_overflow(&mut self) { self.overflow.clear(); } + + pub fn sum(&self) -> u64 { + byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied()) + } + pub fn count_nonzero(&self) -> u64 { + byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n]) } - pub fn is_empty(&self) -> bool { - self.n == 0 + pub fn view(&self) -> IntSliceView<'_> { + // Builder overflow is a HashMap, not sorted raw bytes — convert on the fly + // by collecting into a sorted vec and storing in a thread-local buffer. + // For read-back during building, just call get(slot) directly. + // view() is primarily useful AFTER freeze (on PersistentCompactIntVec). + // Here we expose it via a zero-alloc path: primary only, no overflow raw. + // Callers that need overflow_entries during building use overflow_entries(). + let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n]; + IntSliceView::new(primary, &[], 0, self.n) + } + + pub fn overflow_entries(&self) -> impl Iterator + '_ { + self.overflow.iter().map(|(&k, &v)| (k, v)) + } + + pub fn inc(&mut self, slot: usize) { + let v = self.get(slot); + self.set(slot, v.saturating_add(1)); + } + + // ── Computation methods ─────────────────────────────────────────────────── + + /// Increment one counter per 1-bit of `col`. Safe for any group size. + pub fn inc_present(&mut self, col: BitSliceView<'_>) { + let n = self.n; + for (wi, &word) in col.words().iter().enumerate() { + if word == 0 { continue; } + let mut w = word; + while w != 0 { + let bit = w.trailing_zeros() as usize; + let slot = wi * 64 + bit; + if slot < n { self.inc(slot); } + w &= w - 1; + } + } + } + + /// Increment one counter per 1-bit of `col`, using raw u8 arithmetic. + /// Caller guarantees no counter will reach 255 (group size < 255). + pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) { + { + let primary = self.primary_bytes_mut(); + let n = primary.len(); + for (wi, &word) in col.words().iter().enumerate() { + if word == 0 { continue; } + let mut w = word; + while w != 0 { + let bit = w.trailing_zeros() as usize; + let s = wi * 64 + bit; + if s < n { primary[s] += 1; } + w &= w - 1; + } + } + } + debug_assert!( + !self.primary_bytes().contains(&255), + "sentinel 255 reached in inc_present_fast — group size must be < 255" + ); + } + + /// Two-pass: primary bytes then overflow. Increments `self[slot]` for each + /// slot where `pred(col[slot])` is true. Safe for any group size. + pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + let n = col.len(); + for slot in 0..n { + let b = col.primary_bytes()[slot]; + if b < 255 && pred(b as u32) { + self.inc(slot); + } + } + for (slot, val) in col.overflow_entries() { + if pred(val) { self.inc(slot); } + } + } + + /// Fast two-pass: raw u8 arithmetic. Caller guarantees no counter reaches 255. + pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + let n = col.len(); + { + let primary = self.primary_bytes_mut(); + for slot in 0..n { + let b = col.primary_bytes()[slot]; + if b < 255 && pred(b as u32) { + primary[slot] += 1; + } + } + } + for (slot, val) in col.overflow_entries() { + if pred(val) { self.primary_bytes_mut()[slot] += 1; } + } + debug_assert!( + !self.primary_bytes().contains(&255), + "sentinel 255 reached in inc_predicate_fast — group size must be < 255" + ); + } + + pub fn add(&mut self, other: IntSliceView<'_>) { + let n = self.n; + for s in 0..n { + let sb = self.primary_bytes()[s]; + let ob = other.primary_bytes()[s]; + if sb < 255 && ob < 255 { + let sum = sb as u32 + ob as u32; + if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; } + else { self.set(s, sum); } + } else { + let sv = self.get(s); + let ov = other.get(s); + self.set(s, sv + ov); + } + } + } + + pub fn min(&mut self, other: IntSliceView<'_>) { + let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect(); + let other_ov: HashMap = other.overflow_entries().collect(); + self.clear_overflow(); + for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) { + if b < *a { *a = b; } + } + for (slot, self_val) in self_ov { + if let Some(&other_val) = other_ov.get(&slot) { + self.set(slot, self_val.min(other_val)); + } + } + } + + pub fn max(&mut self, other: IntSliceView<'_>) { + for (slot, other_val) in other.overflow_entries() { + let sv = self.get(slot); + self.set(slot, sv.max(other_val)); + } + for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) { + if b > *a { *a = b; } + } + } + + pub fn diff(&mut self, other: IntSliceView<'_>) { + let n = self.n; + for s in 0..n { + let sb = self.primary_bytes()[s]; + let ob = other.primary_bytes()[s]; + if sb < 255 { + self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 }; + } else { + let sv = self.get(s); + let ov = if ob < 255 { ob as u32 } else { other.get(s) }; + self.set(s, sv.saturating_sub(ov)); + } + } + } + + pub fn mask_with(&mut self, mask: BitSliceView<'_>) { + let n = self.n; + for (wi, &word) in mask.words().iter().enumerate() { + if word == u64::MAX { continue; } + let mut zeros = !word; + while zeros != 0 { + let bit = zeros.trailing_zeros() as usize; + let s = wi * 64 + bit; + if s < n { + let b = self.primary_bytes()[s]; + if b != 0 { self.set(s, 0); } + } + zeros &= zeros - 1; + } + } } - /// Flush the primary mmap, then write sorted overflow data + index and fix the header. pub fn close(self) -> io::Result<()> { self.mmap.flush()?; let Self { path, mmap, n, overflow } = self; @@ -129,35 +261,9 @@ impl PersistentCompactIntVecBuilder { finalize_pciv(&path, n, &entries) } - /// Close and reopen as a read-only [`PersistentCompactIntVec`]. pub fn finish(self) -> io::Result { let path = self.path.clone(); self.close()?; PersistentCompactIntVec::open(&path) } } - -// ── IntSlice / IntSliceMut impls ────────────────────────────────────────────── - -use crate::traits::{IntSlice, IntSliceMut}; - -impl IntSlice for PersistentCompactIntVecBuilder { - fn len(&self) -> usize { self.n } - fn get(&self, slot: usize) -> u32 { self.get(slot) } - fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] } - fn overflow_entries(&self) -> impl Iterator + '_ { - self.overflow.iter().map(|(&k, &v)| (k, v)) - } - fn sum(&self) -> u64 { - byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied()) - } - fn count_nonzero(&self) -> u64 { - byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n]) - } -} - -impl IntSliceMut for PersistentCompactIntVecBuilder { - fn set(&mut self, slot: usize, value: u32) { self.set(slot, value); } - fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] } - fn clear_overflow(&mut self) { self.overflow.clear(); } -} diff --git a/src/obicompactvec/src/colgroup.rs b/src/obicompactvec/src/colgroup.rs index 9fe1659..c238a62 100644 --- a/src/obicompactvec/src/colgroup.rs +++ b/src/obicompactvec/src/colgroup.rs @@ -1,9 +1,7 @@ use std::io; -use crate::memoryvec::MemoryBitVec; use crate::tempbitvec::TempBitVec; use crate::tempintvec::TempCompactIntVec; -use crate::traits::BitSlice; // ── ColGroup ────────────────────────────────────────────────────────────────── @@ -41,22 +39,3 @@ pub trait MatrixGroupOps { /// Per-slot OR: true if any group column has value ≥ `threshold`. fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result; } - -// ── Internal helper ─────────────────────────────────────────────────────────── - -/// Iterate 1-bits of a `MemoryBitVec` and increment the corresponding raw -/// byte. Caller must guarantee that no counter will reach 255 (group size -/// < 255 columns), so that incrementing `u8` is safe and no sentinel is -/// accidentally written. -pub(crate) fn inc_primary_bits(primary: &mut [u8], mask: &MemoryBitVec) { - let n = primary.len(); - for (wi, &word) in mask.words().iter().enumerate() { - let mut w = word; - while w != 0 { - let bit = w.trailing_zeros() as usize; - let s = wi * 64 + bit; - if s < n { primary[s] += 1; } - w &= w - 1; - } - } -} diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index fc64c48..f3486d6 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -1,5 +1,3 @@ -use std::cmp::Ordering; -use std::collections::HashMap; use std::fs::{self, File}; use std::io::{self, BufWriter, Write as _}; use std::path::{Path, PathBuf}; @@ -10,14 +8,13 @@ use rayon::prelude::*; use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix}; use crate::builder::PersistentCompactIntVecBuilder; -use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits}; -use crate::memoryintvec::MemoryIntVec; -use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; -use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; -use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry}; +use crate::colgroup::{ColGroup, MatrixGroupOps}; +use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE}; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; -use crate::traits::{BitSliceMut, IntSlice, IntSliceMut}; +use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; +use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; +use crate::views::IntSliceView; fn col_path(dir: &Path, col: usize) -> PathBuf { dir.join(format!("col_{col:06}.pciv")) @@ -48,9 +45,7 @@ impl ColumnarCompactIntMatrix { } pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) { - for (c, col) in self.cols.iter().enumerate() { - buf[c] = col.get(slot); - } + for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); } } pub(crate) fn sum(&self) -> Array1 { @@ -72,31 +67,22 @@ impl ColumnarCompactIntMatrix { pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j))) } - pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2 { pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j))) } - - pub(crate) fn partial_threshold_jaccard_dist_matrix( - &self, threshold: u32, - ) -> (Array2, Array2) { - pairwise2_matrix(self.n_cols(), |i, j| { - self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold) - }) + pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2, Array2) { + pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)) } - pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } - pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } - pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) @@ -111,7 +97,6 @@ impl ColumnarCompactIntMatrix { meta.n_cols += 1; meta.save(dir) } - } // ── PackedCompactIntMatrix ──────────────────────────────────────────────────── @@ -119,153 +104,12 @@ impl ColumnarCompactIntMatrix { const PCMX_MAGIC: [u8; 4] = *b"PCMX"; const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8) -/// Per-column metadata pre-parsed from the embedded PCIV header. struct ColInfo { - primary_start: usize, // absolute mmap offset to primary array - data_offset: usize, // absolute mmap offset to overflow array + primary_start: usize, + data_offset: usize, n_overflow: usize, - step: usize, - index: Vec<(usize, usize)>, } -// ── PackedIntCol — lightweight column view backed by the shared mmap ────────── - -pub(crate) struct PackedIntCol<'a> { - primary: &'a [u8], - overflow: &'a [u8], // raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE - n_overflow: usize, - step: usize, - index: &'a [(usize, usize)], - n: usize, -} - -impl PackedIntCol<'_> { - fn overflow_get(&self, slot: usize) -> u32 { - let (pos_start, pos_end) = if self.step == 0 { - (0, self.n_overflow) - } else { - let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); - let start = self.index[i].1; - let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow }; - (start, end) - }; - let mut lo = pos_start; - let mut hi = pos_end; - while lo < hi { - let mid = lo + (hi - lo) / 2; - let (stored, val) = parse_overflow_entry(self.overflow, 0, mid); - match stored.cmp(&slot) { - Ordering::Equal => return val, - Ordering::Less => lo = mid + 1, - Ordering::Greater => hi = mid, - } - } - panic!("slot {slot} marked overflow but not found") - } -} - -impl IntSlice for PackedIntCol<'_> { - fn len(&self) -> usize { self.n } - - fn get(&self, slot: usize) -> u32 { - let v = self.primary[slot]; - if v < 255 { v as u32 } else { self.overflow_get(slot) } - } - - fn primary_bytes(&self) -> &[u8] { self.primary } - - fn overflow_entries(&self) -> impl Iterator + '_ { - (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i)) - } - - fn iter(&self) -> impl Iterator + '_ { - PackedIntColIter { - primary: self.primary, - overflow: self.overflow, - slot: 0, - overflow_pos: 0, - n: self.n, - } - } - - fn sum(&self) -> u64 { - byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1)) - } - - fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) } -} - -struct PackedIntColIter<'a> { - primary: &'a [u8], - overflow: &'a [u8], - slot: usize, - overflow_pos: usize, - n: usize, -} - -impl Iterator for PackedIntColIter<'_> { - type Item = u32; - - fn next(&mut self) -> Option { - if self.slot >= self.n { return None; } - let v = self.primary[self.slot]; - self.slot += 1; - if v < 255 { - Some(v as u32) - } else { - let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos); - self.overflow_pos += 1; - Some(val) - } - } - - fn size_hint(&self) -> (usize, Option) { - let rem = self.n - self.slot; - (rem, Some(rem)) - } -} - -impl ExactSizeIterator for PackedIntColIter<'_> {} - -// ── IntColView — uniform column access across Columnar and Packed ───────────── - -enum IntColViewInner<'a> { - Columnar(&'a PersistentCompactIntVec), - Packed(PackedIntCol<'a>), -} - -/// Opaque column view returned by [`PersistentCompactIntMatrix::col_view`]. -/// Implements [`IntSlice`] uniformly for both Columnar and Packed matrix formats. -pub struct IntColView<'a>(IntColViewInner<'a>); - -impl IntSlice for IntColView<'_> { - fn len(&self) -> usize { - match &self.0 { IntColViewInner::Columnar(c) => c.len(), IntColViewInner::Packed(c) => c.len() } - } - fn get(&self, slot: usize) -> u32 { - match &self.0 { IntColViewInner::Columnar(c) => c.get(slot), IntColViewInner::Packed(c) => c.get(slot) } - } - fn primary_bytes(&self) -> &[u8] { - match &self.0 { IntColViewInner::Columnar(c) => c.primary_bytes(), IntColViewInner::Packed(c) => c.primary_bytes() } - } - fn overflow_entries(&self) -> impl Iterator + '_ { - // Box implements Iterator, satisfying RPITIT across two distinct types. - let it: Box + '_> = match &self.0 { - IntColViewInner::Columnar(c) => Box::new(c.overflow_entries()), - IntColViewInner::Packed(c) => Box::new(c.overflow_entries()), - }; - it - } - fn sum(&self) -> u64 { - match &self.0 { IntColViewInner::Columnar(c) => c.sum(), IntColViewInner::Packed(c) => c.sum() } - } - fn count_nonzero(&self) -> u64 { - match &self.0 { IntColViewInner::Columnar(c) => c.count_nonzero(), IntColViewInner::Packed(c) => c.count_nonzero() } - } -} - -// ───────────────────────────────────────────────────────────────────────────── - pub struct PackedCompactIntMatrix { mmap: Mmap, n_rows: usize, @@ -289,52 +133,30 @@ impl PackedCompactIntMatrix { for c in 0..n_cols { let off_pos = PCMX_HEADER + c * 8; let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize; - // Parse embedded PCIV header at col_base let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize; - let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize; - let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize; let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize; - let primary_start = col_base + HEADER_SIZE; let data_offset = primary_start + n_pciv; - let index_offset = data_offset + n_ov * OVERFLOW_ENTRY_SIZE; - - let mut index = Vec::with_capacity(n_idx); - for i in 0..n_idx { - index.push(parse_index_entry(&mmap, index_offset, i)); - } - columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index }); + columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov }); } - Ok(Self { mmap, n_rows, n_cols, columns }) } - pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> { + pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> { let ci = &self.columns[c]; - PackedIntCol { - primary: &self.mmap[ci.primary_start..ci.primary_start + self.n_rows], - overflow: &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE], - n_overflow: ci.n_overflow, - step: ci.step, - index: &ci.index, - n: self.n_rows, - } + let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; + let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE]; + IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows) } pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { - let col = self.col_slice(c); - let overflow: HashMap = col.overflow_entries().collect(); - PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path) - } - - pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec { - MemoryIntVec::from(&self.col_slice(c)) + let view = self.col_view(c); + let overflow: std::collections::HashMap = view.overflow_entries().collect(); + PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path) } #[inline] - pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { - self.col_slice(col).get(slot) - } + pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) } pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) { for c in 0..self.n_cols { buf[c] = self.get(c, slot); } @@ -346,86 +168,61 @@ impl PackedCompactIntMatrix { pub(crate) fn sum(&self) -> Array1 { Array1::from_vec( - (0..self.n_cols).into_par_iter() - .map(|c| self.col_slice(c).sum()) - .collect() + (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect() ) } pub(crate) fn count_nonzero(&self) -> Array1 { Array1::from_vec( - (0..self.n_cols).into_par_iter() - .map(|c| self.col_slice(c).count_nonzero()) - .collect() + (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect() ) } - // ── Pair primitives — sequential scan via col_slice().iter() ───────────── - fn pair_partial_bray(&self, i: usize, j: usize) -> u64 { - self.col_slice(i).iter().zip(self.col_slice(j).iter()) - .map(|(a, b)| a.min(b) as u64) - .sum() + self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum() } - fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 { - self.col_slice(i).iter().zip(self.col_slice(j).iter()) - .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) - .sum() + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum() } - fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) { - self.col_slice(i).iter().zip(self.col_slice(j).iter()) + self.col_view(i).iter().zip(self.col_view(j).iter()) .fold((0u64, 0u64), |(inter, uni), (a, b)| { - let ap = a >= t; - let bp = b >= t; + let ap = a >= t; let bp = b >= t; (inter + (ap & bp) as u64, uni + (ap | bp) as u64) }) } - fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - self.col_slice(i).iter().zip(self.col_slice(j).iter()) - .map(|(a, b)| (a as f64 / si).min(b as f64 / sj)) - .sum() + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum() } - fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - self.col_slice(i).iter().zip(self.col_slice(j).iter()) - .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }) - .sum() + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum() } - fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - self.col_slice(i).iter().zip(self.col_slice(j).iter()) - .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }) - .sum() + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum() } - // ── Matrix methods ──────────────────────────────────────────────────────── - pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j)) } - pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2 { pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j)) } - pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2, Array2) { pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t)) } - pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64)) } @@ -435,32 +232,21 @@ impl PackedCompactIntMatrix { pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> { let packed_path = dir.join("matrix.pcmx"); if packed_path.exists() { - // Matrix complete; remove any leftover column files from a killed cleanup. if let Ok(meta) = MatrixMeta::load(dir) { for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } let _ = fs::remove_file(dir.join("meta.json")); } return Ok(()); } - - let meta = MatrixMeta::load(dir)?; + let meta = MatrixMeta::load(dir)?; let n_cols = meta.n_cols; - - // Compute offsets from file sizes — no column data loaded into RAM. let col_sizes: Vec = (0..n_cols) .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len())) .collect::>()?; - let header_size = (PCMX_HEADER + n_cols * 8) as u64; let mut col_offset = header_size; let mut offsets = Vec::with_capacity(n_cols); - for &size in &col_sizes { - offsets.push(col_offset); - col_offset += size; - } - - // Write to a temp file; rename atomically so a killed process never leaves - // a truncated matrix.pcmx that would be mistaken for a complete file. + for &size in &col_sizes { offsets.push(col_offset); col_offset += size; } let tmp_path = dir.join("matrix.pcmx.tmp"); let mut out = BufWriter::new(File::create(&tmp_path)?); out.write_all(&PCMX_MAGIC)?; @@ -468,13 +254,10 @@ pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> { out.write_all(&(meta.n as u64).to_le_bytes())?; out.write_all(&(n_cols as u64).to_le_bytes())?; for &off in &offsets { out.write_all(&off.to_le_bytes())?; } - for c in 0..n_cols { - io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; - } + for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; } out.flush()?; drop(out); fs::rename(&tmp_path, &packed_path)?; - for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; } fs::remove_file(dir.join("meta.json"))?; Ok(()) @@ -488,18 +271,14 @@ pub enum PersistentCompactIntMatrix { } impl PersistentCompactIntMatrix { - /// Open from `layer_dir`, auto-detecting Packed or Columnar. pub fn open(layer_dir: &Path) -> io::Result { let counts_dir = layer_dir.join("counts"); - if counts_dir.join("matrix.pcmx").exists() { return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?)); } - if MatrixMeta::load(&counts_dir).is_ok() { return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?)); } - Err(io::Error::new( io::ErrorKind::NotFound, format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()), @@ -509,7 +288,6 @@ impl PersistentCompactIntMatrix { pub fn n(&self) -> usize { match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows } } - pub fn n_cols(&self) -> usize { match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols } } @@ -521,10 +299,10 @@ impl PersistentCompactIntMatrix { } } - pub fn col_view(&self, c: usize) -> IntColView<'_> { + pub fn col_view(&self, c: usize) -> IntSliceView<'_> { match self { - Self::Columnar(m) => IntColView(IntColViewInner::Columnar(m.col(c))), - Self::Packed(m) => IntColView(IntColViewInner::Packed(m.col_slice(c))), + Self::Columnar(m) => m.col(c).view(), + Self::Packed(m) => m.col_view(c), } } @@ -535,29 +313,18 @@ impl PersistentCompactIntMatrix { } } - pub fn col_as_memory(&self, c: usize) -> MemoryIntVec { - match self { - Self::Columnar(m) => MemoryIntVec::from(m.col(c)), - Self::Packed(m) => m.col_as_memory(c), - } - } - pub fn row(&self, slot: usize) -> Box<[u32]> { match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) } } - pub fn fill_row(&self, slot: usize, buf: &mut [u32]) { match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) } } - pub fn sum(&self) -> Array1 { match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() } } - pub fn count_nonzero(&self) -> Array1 { match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() } } - pub fn partial_bray_dist_matrix(&self) -> Array2 { match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() } } @@ -576,7 +343,6 @@ impl PersistentCompactIntMatrix { pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) } } - pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> { ColumnarCompactIntMatrix::append_column(dir, value_of) } @@ -592,12 +358,12 @@ impl ColumnWeights for PersistentCompactIntMatrix { } impl CountPartials for PersistentCompactIntMatrix { - fn partial_bray(&self) -> Array2 { self.partial_bray_dist_matrix() } - fn partial_euclidean(&self) -> Array2 { self.partial_euclidean_dist_matrix() } + fn partial_bray(&self) -> Array2 { self.partial_bray_dist_matrix() } + fn partial_euclidean(&self) -> Array2 { self.partial_euclidean_dist_matrix() } fn partial_threshold_jaccard(&self, t: u32) -> (Array2, Array2) { self.partial_threshold_jaccard_dist_matrix(t) } - fn partial_relfreq_bray(&self, g: &Array1) -> Array2 { self.partial_relfreq_bray_dist_matrix(g) } - fn partial_relfreq_euclidean(&self, g: &Array1) -> Array2 { self.partial_relfreq_euclidean_dist_matrix(g) } - fn partial_hellinger(&self, g: &Array1) -> Array2 { self.partial_hellinger_euclidean_dist_matrix(g) } + fn partial_relfreq_bray(&self, g: &Array1) -> Array2 { self.partial_relfreq_bray_dist_matrix(g) } + fn partial_relfreq_euclidean(&self, g: &Array1) -> Array2 { self.partial_relfreq_euclidean_dist_matrix(g) } + fn partial_hellinger(&self, g: &Array1) -> Array2 { self.partial_hellinger_euclidean_dist_matrix(g) } } // ── Builder ─────────────────────────────────────────────────────────────────── @@ -613,16 +379,13 @@ impl PersistentCompactIntMatrixBuilder { fs::create_dir_all(dir)?; Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 }) } - pub fn n(&self) -> usize { self.n } pub fn n_cols(&self) -> usize { self.n_cols } - pub fn add_col(&mut self) -> io::Result { let path = col_path(&self.dir, self.n_cols); self.n_cols += 1; PersistentCompactIntVecBuilder::new(self.n, &path) } - pub fn close(self) -> io::Result<()> { MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir) } @@ -634,30 +397,20 @@ impl MatrixGroupOps for PersistentCompactIntMatrix { fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result { let n = self.n(); if g.indices.len() < 255 { - // Fast path: counts fit in u8 — accumulate directly into raw bytes. let mut builder = TempCompactIntVecBuilder::new(n)?; - { - let primary = builder.primary_bytes_mut(); - for &c in &g.indices { - let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); - inc_primary_bits(primary, &mask); - } + for &c in &g.indices { + builder.inc_predicate_fast(self.col_view(c), |v| v >= threshold); } builder.freeze() } else { - // Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks. let mut result = TempCompactIntVecBuilder::new(n)?; for chunk in g.indices.chunks(254) { - let mut chunk_builder = TempCompactIntVecBuilder::new(n)?; - { - let primary = chunk_builder.primary_bytes_mut(); - for &c in chunk { - let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); - inc_primary_bits(primary, &mask); - } + let mut chunk_b = TempCompactIntVecBuilder::new(n)?; + for &c in chunk { + chunk_b.inc_predicate_fast(self.col_view(c), |v| v >= threshold); } - let chunk_frozen = chunk_builder.freeze()?; - IntSliceMut::add(&mut result, &chunk_frozen); + let frozen = chunk_b.freeze()?; + result.add(frozen.view()); } result.freeze() } @@ -666,10 +419,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix { fn partial_group_sum(&self, g: &ColGroup) -> io::Result { let n = self.n(); let mut result = TempCompactIntVecBuilder::new(n)?; - for &c in &g.indices { - let view = self.col_view(c); - IntSliceMut::add(&mut result, &view); - } + for &c in &g.indices { result.add(self.col_view(c)); } result.freeze() } @@ -677,8 +427,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix { let n = self.n(); let mut result = TempBitVecBuilder::new(n)?; for &c in &g.indices { - let mask = self.col_view(c).cmp_scalar(|v| v >= threshold); - result.or(&mask); + result.or_where(self.col_view(c), |v| v >= threshold); } result.freeze() } diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index 6625ab6..ddd3bdc 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -5,26 +5,24 @@ mod colgroup; mod format; mod intmatrix; mod layer_meta; -mod memoryintvec; -mod memoryvec; mod meta; mod reader; mod tempbitvec; mod tempintvec; +mod views; pub mod traits; pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder}; -pub use bitmatrix::{BitColView, PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix}; +pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix}; pub use builder::PersistentCompactIntVecBuilder; pub use colgroup::{ColGroup, MatrixGroupOps}; -pub use intmatrix::{IntColView, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; +pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use layer_meta::LayerMeta; -pub use memoryintvec::{MemoryIntIter, MemoryIntVec}; -pub use memoryvec::MemoryBitVec; pub use reader::PersistentCompactIntVec; pub use tempbitvec::TempBitVec; pub use tempintvec::TempCompactIntVec; -pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit}; +pub use traits::{BitPartials, ColumnWeights, CountPartials}; +pub use views::{BitSliceView, IntSliceView}; #[cfg(test)] #[path = "tests/mod.rs"] diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs deleted file mode 100644 index d5ca280..0000000 --- a/src/obicompactvec/src/memoryintvec.rs +++ /dev/null @@ -1,186 +0,0 @@ -use std::collections::HashMap; -use std::io; -use std::ops::{Add, AddAssign, Sub, SubAssign}; -use std::path::Path; - -use crate::builder::PersistentCompactIntVecBuilder; -use crate::format::{byte_count_nonzero, byte_sum}; -use crate::traits::{IntSlice, IntSliceMut}; - -// ── MemoryIntVec ────────────────────────────────────────────────────────────── - -#[derive(Clone)] -pub struct MemoryIntVec { - primary: Vec, - overflow: HashMap, - n: usize, -} - -impl MemoryIntVec { - pub fn new(n: usize) -> Self { - Self { primary: vec![0u8; n], overflow: HashMap::new(), n } - } - - pub fn len(&self) -> usize { self.n } - pub fn is_empty(&self) -> bool { self.n == 0 } - - /// Construct directly from a pre-built primary array (no overflow — all values < 255). - pub(crate) fn from_primary(primary: Vec) -> Self { - let n = primary.len(); - Self { primary, overflow: HashMap::new(), n } - } - - pub(crate) fn from_primary_and_overflow(primary: Vec, overflow: HashMap) -> Self { - let n = primary.len(); - Self { primary, overflow, n } - } - - pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary } - pub(crate) fn overflow_map(&self) -> &HashMap { &self.overflow } - - pub fn get(&self, slot: usize) -> u32 { - match self.primary[slot] { - 255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"), - v => v as u32, - } - } - - pub fn sum(&self) -> u64 { - byte_sum(&self.primary, self.overflow.values().copied()) - } - - pub fn count_nonzero(&self) -> u64 { - byte_count_nonzero(&self.primary) - } - - pub fn filled(n: usize, value: u32) -> Self { - if value < 255 { - Self { primary: vec![value as u8; n], overflow: HashMap::new(), n } - } else { - Self { primary: vec![255u8; n], overflow: (0..n).map(|i| (i, value)).collect(), n } - } - } - - pub fn iter(&self) -> MemoryIntIter<'_> { - MemoryIntIter { vec: self, slot: 0 } - } - - /// Write to disk and return a writable builder at `path`. - pub fn persist(&self, path: &Path) -> io::Result { - PersistentCompactIntVecBuilder::from_memory(self, path) - } -} - -// ── IntSlice / IntSliceMut ──────────────────────────────────────────────────── - -impl IntSlice for MemoryIntVec { - fn len(&self) -> usize { self.n } - fn get(&self, slot: usize) -> u32 { self.get(slot) } - fn primary_bytes(&self) -> &[u8] { &self.primary } - fn overflow_entries(&self) -> impl Iterator + '_ { - self.overflow.iter().map(|(&k, &v)| (k, v)) - } - fn iter(&self) -> impl Iterator + '_ { self.iter() } - fn sum(&self) -> u64 { self.sum() } - fn count_nonzero(&self) -> u64 { self.count_nonzero() } -} - -impl IntSliceMut for MemoryIntVec { - fn set(&mut self, slot: usize, value: u32) { - if value < 255 { - self.primary[slot] = value as u8; - self.overflow.remove(&slot); - } else { - self.primary[slot] = 255; - self.overflow.insert(slot, value); - } - } - fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.primary } - fn clear_overflow(&mut self) { self.overflow.clear(); } -} - -// ── From conversions ────────────────────────────────────────────────────────── - -impl MemoryIntVec { - /// Bulk copy from another `MemoryIntVec`: memcpy for the primary bytes, - /// clone for the overflow map. - pub fn copy_from_memory(&mut self, src: &MemoryIntVec) { - assert_eq!(self.n, src.n, "MemoryIntVec length mismatch"); - self.primary.copy_from_slice(&src.primary); - self.overflow = src.overflow.clone(); - } -} - -impl From<&S> for MemoryIntVec { - fn from(src: &S) -> Self { - Self::from_primary_and_overflow( - src.primary_bytes().to_vec(), - src.overflow_entries().collect(), - ) - } -} - -// ── std::ops — owned (consumes lhs) ────────────────────────────────────────── - -impl Add<&B> for MemoryIntVec { - type Output = MemoryIntVec; - fn add(mut self, rhs: &B) -> MemoryIntVec { IntSliceMut::add(&mut self, rhs); self } -} - -impl Sub<&B> for MemoryIntVec { - type Output = MemoryIntVec; - fn sub(mut self, rhs: &B) -> MemoryIntVec { self.diff(rhs); self } -} - -// ── std::ops — borrowed (clones lhs) ───────────────────────────────────────── - -impl Add<&B> for &MemoryIntVec { - type Output = MemoryIntVec; - fn add(self, rhs: &B) -> MemoryIntVec { self.clone().add(rhs) } -} - -impl Sub<&B> for &MemoryIntVec { - type Output = MemoryIntVec; - fn sub(self, rhs: &B) -> MemoryIntVec { self.clone().sub(rhs) } -} - -// ── std::ops — in-place assign ──────────────────────────────────────────────── - -impl AddAssign<&B> for MemoryIntVec { - fn add_assign(&mut self, rhs: &B) { IntSliceMut::add(self, rhs); } -} - -impl SubAssign<&B> for MemoryIntVec { - fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); } -} - -// ── Iterator ────────────────────────────────────────────────────────────────── - -pub struct MemoryIntIter<'a> { - vec: &'a MemoryIntVec, - slot: usize, -} - -impl Iterator for MemoryIntIter<'_> { - type Item = u32; - - fn next(&mut self) -> Option { - if self.slot >= self.vec.n { return None; } - let v = self.vec.get(self.slot); - self.slot += 1; - Some(v) - } - - fn size_hint(&self) -> (usize, Option) { - let rem = self.vec.n - self.slot; - (rem, Some(rem)) - } -} - -impl ExactSizeIterator for MemoryIntIter<'_> {} - -impl<'a> IntoIterator for &'a MemoryIntVec { - type Item = u32; - type IntoIter = MemoryIntIter<'a>; - fn into_iter(self) -> MemoryIntIter<'a> { self.iter() } -} diff --git a/src/obicompactvec/src/memoryvec.rs b/src/obicompactvec/src/memoryvec.rs deleted file mode 100644 index fef0960..0000000 --- a/src/obicompactvec/src/memoryvec.rs +++ /dev/null @@ -1,138 +0,0 @@ -use std::io; -use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not}; -use std::path::Path; - -use crate::bitvec::{BitIter, PersistentBitVecBuilder, n_words}; -use crate::traits::{BitSlice, BitSliceMut}; - -// ── MemoryBitVec ────────────────────────────────────────────────────────────── - -#[derive(Clone)] -pub struct MemoryBitVec { - words: Vec, - n: usize, -} - -impl MemoryBitVec { - pub fn new(n: usize) -> Self { - Self { words: vec![0u64; n_words(n)], n } - } - - pub fn ones(n: usize) -> Self { - let rem = n % 64; - let mut words = vec![u64::MAX; n_words(n)]; - if rem != 0 { - if let Some(last) = words.last_mut() { *last = (1u64 << rem) - 1; } - } - Self { words, n } - } - - pub(crate) fn from_words(words: Vec, n: usize) -> Self { - Self { words, n } - } - - pub fn len(&self) -> usize { self.n } - pub fn is_empty(&self) -> bool { self.n == 0 } - - pub fn get(&self, slot: usize) -> bool { - (self.words[slot >> 6] >> (slot & 63)) & 1 != 0 - } - - /// Write to disk and return a writable builder positioned at the same path. - pub fn persist(&self, path: &Path) -> io::Result { - let mut b = PersistentBitVecBuilder::new(self.n, path)?; - b.copy_from(self); - Ok(b) - } -} - -// ── BitSlice / BitSliceMut ──────────────────────────────────────────────────── - -impl BitSlice for MemoryBitVec { - fn len(&self) -> usize { self.n } - fn words(&self) -> &[u64] { &self.words } -} - -impl BitSliceMut for MemoryBitVec { - fn words_mut(&mut self) -> &mut [u64] { &mut self.words } -} - -// ── From conversions ────────────────────────────────────────────────────────── - -impl From<&S> for MemoryBitVec { - fn from(src: &S) -> Self { - Self { words: src.words().to_vec(), n: src.len() } - } -} - -// ── std::ops — owned (consumes lhs) ────────────────────────────────────────── - -impl BitAnd<&B> for MemoryBitVec { - type Output = MemoryBitVec; - fn bitand(mut self, rhs: &B) -> MemoryBitVec { self.and(rhs); self } -} - -impl BitOr<&B> for MemoryBitVec { - type Output = MemoryBitVec; - fn bitor(mut self, rhs: &B) -> MemoryBitVec { self.or(rhs); self } -} - -impl BitXor<&B> for MemoryBitVec { - type Output = MemoryBitVec; - fn bitxor(mut self, rhs: &B) -> MemoryBitVec { self.xor(rhs); self } -} - -impl Not for MemoryBitVec { - type Output = MemoryBitVec; - fn not(mut self) -> MemoryBitVec { BitSliceMut::not(&mut self); self } -} - -// ── std::ops — borrowed (clones lhs) ───────────────────────────────────────── - -impl BitAnd<&B> for &MemoryBitVec { - type Output = MemoryBitVec; - fn bitand(self, rhs: &B) -> MemoryBitVec { self.clone().bitand(rhs) } -} - -impl BitOr<&B> for &MemoryBitVec { - type Output = MemoryBitVec; - fn bitor(self, rhs: &B) -> MemoryBitVec { self.clone().bitor(rhs) } -} - -impl BitXor<&B> for &MemoryBitVec { - type Output = MemoryBitVec; - fn bitxor(self, rhs: &B) -> MemoryBitVec { self.clone().bitxor(rhs) } -} - -impl Not for &MemoryBitVec { - type Output = MemoryBitVec; - fn not(self) -> MemoryBitVec { !self.clone() } -} - -// ── std::ops — in-place assign ──────────────────────────────────────────────── - -impl BitAndAssign<&B> for MemoryBitVec { - fn bitand_assign(&mut self, rhs: &B) { self.and(rhs); } -} - -impl BitOrAssign<&B> for MemoryBitVec { - fn bitor_assign(&mut self, rhs: &B) { self.or(rhs); } -} - -impl BitXorAssign<&B> for MemoryBitVec { - fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); } -} - -// ── Iterator ────────────────────────────────────────────────────────────────── - -impl MemoryBitVec { - pub fn iter(&self) -> BitIter<'_> { - BitIter { words: &self.words, slot: 0, n: self.n } - } -} - -impl<'a> IntoIterator for &'a MemoryBitVec { - type Item = bool; - type IntoIter = BitIter<'a>; - fn into_iter(self) -> BitIter<'a> { self.iter() } -} diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index af7d05c..f3b1dd6 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -5,6 +5,7 @@ use std::path::{Path, PathBuf}; use memmap2::Mmap; use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry}; +use crate::views::IntSliceView; pub struct PersistentCompactIntVec { mmap: Mmap, @@ -18,97 +19,60 @@ pub struct PersistentCompactIntVec { } impl PersistentCompactIntVec { - /// Opens a persistent compact int vector from the given path. pub fn open(path: &Path) -> io::Result { let mmap = unsafe { Mmap::map(&File::open(path)?)? }; if mmap.len() < HEADER_SIZE { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "PCIV file too short", - )); + return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short")); } if &mmap[0..4] != &MAGIC { return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic")); } - let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; + let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize; - let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize; - let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize; + let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize; + let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize; let primary_offset = HEADER_SIZE; - let data_offset = primary_offset + n; - let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE; + let data_offset = primary_offset + n; + let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE; let mut index = Vec::with_capacity(n_index); for i in 0..n_index { index.push(parse_index_entry(&mmap, index_offset, i)); } - Ok(Self { - mmap, - n, - n_overflow, - step, - index, - primary_offset, - data_offset, - path: path.to_path_buf(), - }) + Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset, path: path.to_path_buf() }) } - /// Returns the path of the compact int vector file. - pub fn path(&self) -> &Path { - &self.path - } + pub fn path(&self) -> &Path { &self.path } + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } - /// Returns the length of the compact int vector. - pub fn len(&self) -> usize { - self.n - } - - /// Returns whether the compact int vector is empty. - pub fn is_empty(&self) -> bool { - self.n == 0 - } - - /// Returns the value at the given slot. pub fn get(&self, slot: usize) -> u32 { match self.mmap[self.primary_offset + slot] { 255 => self.overflow_get(slot), - v => v as u32, + v => v as u32, } } - /// Returns the value at the given slot from the overflow region. fn overflow_get(&self, slot: usize) -> u32 { - let pos_start; - let pos_end; - - if self.step == 0 { - pos_start = 0; - pos_end = self.n_overflow; + let (pos_start, pos_end) = if self.step == 0 { + (0, self.n_overflow) } else { - let i = self - .index - .partition_point(|&(s, _)| s <= slot) - .saturating_sub(1); - pos_start = self.index[i].1; - pos_end = if i + 1 < self.index.len() { - self.index[i + 1].1 - } else { - self.n_overflow - }; - } - + let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); + let start = self.index[i].1; + let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow }; + (start, end) + }; let mut lo = pos_start; let mut hi = pos_end; while lo < hi { let mid = lo + (hi - lo) / 2; match self.data_slot(mid).cmp(&slot) { - std::cmp::Ordering::Equal => return self.data_value(mid), - std::cmp::Ordering::Less => lo = mid + 1, + std::cmp::Ordering::Equal => return self.data_value(mid), + std::cmp::Ordering::Less => lo = mid + 1, std::cmp::Ordering::Greater => hi = mid, } } @@ -116,14 +80,12 @@ impl PersistentCompactIntVec { } #[inline] - /// Returns the slot at the given index in the overflow region. fn data_slot(&self, i: usize) -> usize { let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE; u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize } #[inline] - /// Returns the value at the given index in the overflow region. fn data_value(&self, i: usize) -> u32 { let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8; u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) @@ -139,121 +101,70 @@ impl PersistentCompactIntVec { byte_count_nonzero(primary) } - #[inline] - /// Returns the Bray-Curtis distance between two compact int vectors. + /// Lightweight zero-copy view — primary and overflow point into the mmap. + pub fn view(&self) -> IntSliceView<'_> { + let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n]; + let overflow_raw = &self.mmap[self.data_offset..self.data_offset + self.n_overflow * OVERFLOW_ENTRY_SIZE]; + IntSliceView::new(primary, overflow_raw, self.n_overflow, self.n) + } + + pub fn iter(&self) -> Iter<'_> { + Iter { pciv: self, slot: 0, overflow_pos: 0 } + } + + // ── Distance methods ────────────────────────────────────────────────────── + pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { let sum_min = self.partial_bray_dist(other); let denom = self.sum() + other.sum(); - if denom == 0 { - return 0.0; - } - 1.0 - 2.0 * sum_min as f64 / denom as f64 + if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 } } - /// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis. - /// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`. pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) - .map(|(a, b)| a.min(b) as u64) - .sum() + self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum() } - /// Returns the relative frequency Bray-Curtis distance between two compact int vectors. - /// - /// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts. pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - let sum_a = self.sum() as f64; - let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { - return 0.0; - } - let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b); - 1.0 - sum_min + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + 1.0 - self.partial_relfreq_bray_dist(other, sa, sb) } - /// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors. - /// - /// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency - /// Bray-Curtis distance over a set of vector pairs. - /// - /// Arguments: - /// - `other`: the other compact int vector to compare with - /// - `sum_a`: the sum of the first vector's counts - /// - `sum_b`: the sum of the second vector's counts - /// - /// Returns the sum of the minimum relative frequencies at each index. - pub fn partial_relfreq_bray_dist( - &self, - other: &PersistentCompactIntVec, - sum_a: f64, - sum_b: f64, - ) -> f64 { + pub fn partial_relfreq_bray_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - let sum_min: f64 = self - .iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .map(|(a, b)| { let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; pa.min(pb) }) - .sum(); - sum_min + .sum() } - /// Returns the euclidean distance between two compact int vectors. pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { self.partial_euclidean_dist(other).sqrt() } - /// Returns the partial euclidean distance between two compact int vectors. - /// - /// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance - /// over a set of vector pairs. - /// - /// The result is the sum of the squared differences between corresponding elements of the two - /// vectors. pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) - .map(|(a, b)| { - let d = a as f64 - b as f64; - d * d - }) + self.iter().zip(other.iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) .sum() } - /// Returns the relative frequency euclidean distance between two compact int vectors. - /// - /// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts. pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); - let sum_a = self.sum() as f64; - let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { - return 0.0; - } - self.partial_relfreq_euclidean_dist(other, sum_a, sum_b) - .sqrt() + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt() } - /// Returns the partial relative frequency euclidean distance between two compact int vectors. - /// - /// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency - /// euclidean distance over a set of vector pairs. - pub fn partial_relfreq_euclidean_dist( - &self, - other: &PersistentCompactIntVec, - sum_a: f64, - sum_b: f64, - ) -> f64 { + pub fn partial_relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .map(|(a, b)| { let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; @@ -263,46 +174,19 @@ impl PersistentCompactIntVec { .sum() } - /// Returns the Euclidean distance between two compact int vectors using the Hellinger transform. - /// - /// The Hellinger transform is applied to the raw counts of each vector, and the result is - /// the Euclidean distance between the transformed vectors. The Hellinger transform is defined - /// as the square root of the relative frequencies. pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); - let sum_a = self.sum() as f64; - let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { - return 0.0; - } - self.partial_hellinger_euclidean_dist(other, sum_a, sum_b) - .sqrt() + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt() } - /// Returns the partial Hellinger Euclidean distance between two compact int vectors. - /// - /// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger - /// Euclidean distance over a set of vector pairs. - pub fn partial_hellinger_euclidean_dist( - &self, - other: &PersistentCompactIntVec, - sum_a: f64, - sum_b: f64, - ) -> f64 { + pub fn partial_hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .map(|(a, b)| { - let pa = if sum_a > 0.0 { - (a as f64 / sum_a).sqrt() - } else { - 0.0 - }; - let pb = if sum_b > 0.0 { - (b as f64 / sum_b).sqrt() - } else { - 0.0 - }; + let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 }; + let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 }; let d = pa - pb; d * d }) @@ -314,22 +198,13 @@ impl PersistentCompactIntVec { } pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold); - if union == 0 { - return 0.0; - } - 1.0 - intersection as f64 / union as f64 + if union == 0 { 0.0 } else { 1.0 - intersection as f64 / union as f64 } } - pub fn partial_threshold_jaccard_dist( - &self, - other: &PersistentCompactIntVec, - threshold: u32, - ) -> (u64, u64) { + pub fn partial_threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> (u64, u64) { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .fold((0u64, 0u64), |(inter, uni), (a, b)| { let ap = a >= threshold; let bp = b >= threshold; @@ -340,41 +215,12 @@ impl PersistentCompactIntVec { pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 { self.threshold_jaccard_dist(other, 1) } - - pub fn iter(&self) -> Iter<'_> { - Iter { - pciv: self, - slot: 0, - overflow_pos: 0, - } - } -} - -// ── IntSlice impl ───────────────────────────────────────────────────────────── - -use crate::traits::IntSlice; - -impl IntSlice for PersistentCompactIntVec { - fn len(&self) -> usize { self.n } - fn get(&self, slot: usize) -> u32 { self.get(slot) } - fn primary_bytes(&self) -> &[u8] { - &self.mmap[self.primary_offset..self.primary_offset + self.n] - } - fn overflow_entries(&self) -> impl Iterator + '_ { - (0..self.n_overflow).map(|i| (self.data_slot(i), self.data_value(i))) - } - fn iter(&self) -> impl Iterator + '_ { self.iter() } - fn sum(&self) -> u64 { self.sum() } - fn count_nonzero(&self) -> u64 { self.count_nonzero() } } impl<'a> IntoIterator for &'a PersistentCompactIntVec { type Item = u32; type IntoIter = Iter<'a>; - - fn into_iter(self) -> Iter<'a> { - self.iter() - } + fn into_iter(self) -> Iter<'a> { self.iter() } } pub struct Iter<'a> { @@ -389,9 +235,7 @@ impl Iterator for Iter<'_> { type Item = u32; fn next(&mut self) -> Option { - if self.slot >= self.pciv.n { - return None; - } + if self.slot >= self.pciv.n { return None; } let v = self.pciv.mmap[self.pciv.primary_offset + self.slot]; self.slot += 1; if v < 255 { diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs index 3945075..3024ffb 100644 --- a/src/obicompactvec/src/tempbitvec.rs +++ b/src/obicompactvec/src/tempbitvec.rs @@ -4,43 +4,48 @@ use std::path::Path; use tempfile::TempDir; use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; -use crate::traits::{BitSlice, BitSliceMut}; +use crate::views::{BitSliceIter, BitSliceView, IntSliceView}; // ── TempBitVec — frozen read-only, auto-deleted on drop ────────────────────── -/// A bit vector backed by a temporary file. -/// Implements [`BitSlice`]; the file is deleted when this value is dropped. -/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file. pub struct TempBitVec { - vec: PersistentBitVec, + vec: PersistentBitVec, // Dropped after `vec` (field order), so the mmap is released before the // temp directory is deleted. _temp: TempDir, } impl TempBitVec { - /// Copy to a permanent file and open as a [`PersistentBitVec`]. pub fn make_persistent(&self, path: &Path) -> io::Result { std::fs::copy(self.vec.path(), path)?; PersistentBitVec::open(path) } - pub fn len(&self) -> usize { self.vec.len() } - pub fn is_empty(&self) -> bool { self.vec.is_empty() } -} - -impl BitSlice for TempBitVec { - fn len(&self) -> usize { self.vec.len() } - fn words(&self) -> &[u64] { self.vec.words() } + pub fn len(&self) -> usize { + self.vec.len() + } + pub fn is_empty(&self) -> bool { + self.vec.is_empty() + } + pub fn get(&self, slot: usize) -> bool { + self.vec.get(slot) + } + pub fn count_ones(&self) -> u64 { + self.vec.count_ones() + } + pub fn view(&self) -> BitSliceView<'_> { + self.vec.view() + } + pub fn iter(&self) -> BitSliceIter<'_> { + self.view().iter() + } } // ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ──────────────── -/// Writable builder for a [`TempBitVec`]. `pub(crate)` — callers receive -/// only the frozen result via [`freeze`](Self::freeze). pub(crate) struct TempBitVecBuilder { builder: PersistentBitVecBuilder, - temp: TempDir, + temp: TempDir, } impl TempBitVecBuilder { @@ -51,19 +56,35 @@ impl TempBitVecBuilder { Ok(Self { builder, temp }) } - /// Finalize writes and return a frozen, read-only [`TempBitVec`]. pub(crate) fn freeze(self) -> io::Result { let Self { builder, temp } = self; let vec = builder.finish()?; Ok(TempBitVec { vec, _temp: temp }) } -} -impl BitSlice for TempBitVecBuilder { - fn len(&self) -> usize { self.builder.len() } - fn words(&self) -> &[u64] { self.builder.words() } -} + pub fn set(&mut self, slot: usize, value: bool) { + self.builder.set(slot, value); + } + pub(crate) fn view(&self) -> BitSliceView<'_> { + self.builder.view() + } -impl BitSliceMut for TempBitVecBuilder { - fn words_mut(&mut self) -> &mut [u64] { self.builder.words_mut() } + pub fn or(&mut self, other: BitSliceView<'_>) { + self.builder.or(other); + } + + /// Set self[slot] where pred(col[slot]) is true. Two-pass: primary then overflow. + pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + for slot in 0..col.len() { + let b = col.primary_bytes()[slot]; + if b < 255 && pred(b as u32) { + self.builder.set(slot, true); + } + } + for (slot, val) in col.overflow_entries() { + if pred(val) { + self.builder.set(slot, true); + } + } + } } diff --git a/src/obicompactvec/src/tempintvec.rs b/src/obicompactvec/src/tempintvec.rs index ced3cef..e5ff848 100644 --- a/src/obicompactvec/src/tempintvec.rs +++ b/src/obicompactvec/src/tempintvec.rs @@ -5,13 +5,10 @@ use tempfile::TempDir; use crate::builder::PersistentCompactIntVecBuilder; use crate::reader::PersistentCompactIntVec; -use crate::traits::{IntSlice, IntSliceMut}; +use crate::views::{BitSliceView, IntSliceView}; // ── TempCompactIntVec — frozen read-only, auto-deleted on drop ──────────────── -/// A compact int vector backed by a temporary file. -/// Implements [`IntSlice`]; the file is deleted when this value is dropped. -/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file. pub struct TempCompactIntVec { vec: PersistentCompactIntVec, // Dropped after `vec` (field order), so the mmap is released before the @@ -20,7 +17,6 @@ pub struct TempCompactIntVec { } impl TempCompactIntVec { - /// Copy to a permanent file and open as a [`PersistentCompactIntVec`]. pub fn make_persistent(&self, path: &Path) -> io::Result { std::fs::copy(self.vec.path(), path)?; PersistentCompactIntVec::open(path) @@ -28,23 +24,14 @@ impl TempCompactIntVec { pub fn len(&self) -> usize { self.vec.len() } pub fn is_empty(&self) -> bool { self.vec.is_empty() } -} - -impl IntSlice for TempCompactIntVec { - fn len(&self) -> usize { self.vec.len() } - fn get(&self, slot: usize) -> u32 { self.vec.get(slot) } - fn primary_bytes(&self) -> &[u8] { self.vec.primary_bytes() } - fn overflow_entries(&self) -> impl Iterator + '_ { - self.vec.overflow_entries() - } - fn sum(&self) -> u64 { self.vec.sum() } - fn count_nonzero(&self) -> u64 { self.vec.count_nonzero() } + pub fn get(&self, slot: usize) -> u32 { self.vec.get(slot) } + pub fn sum(&self) -> u64 { self.vec.sum() } + pub fn view(&self) -> IntSliceView<'_> { self.vec.view() } + pub fn iter(&self) -> crate::reader::Iter<'_> { self.vec.iter() } } // ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ── -/// Writable builder for a [`TempCompactIntVec`]. `pub(crate)` — callers -/// receive only the frozen result via [`freeze`](Self::freeze). pub(crate) struct TempCompactIntVecBuilder { builder: PersistentCompactIntVecBuilder, temp: TempDir, @@ -58,25 +45,47 @@ impl TempCompactIntVecBuilder { Ok(Self { builder, temp }) } - /// Finalize writes and return a frozen, read-only [`TempCompactIntVec`]. pub(crate) fn freeze(self) -> io::Result { let Self { builder, temp } = self; let vec = builder.finish()?; Ok(TempCompactIntVec { vec, _temp: temp }) } -} -impl IntSlice for TempCompactIntVecBuilder { - fn len(&self) -> usize { self.builder.len() } - fn get(&self, slot: usize) -> u32 { self.builder.get(slot) } - fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() } - fn overflow_entries(&self) -> impl Iterator + '_ { - self.builder.overflow_entries() + // ── Delegation methods ──────────────────────────────────────────────────── + + pub(crate) fn n(&self) -> usize { self.builder.len() } + + pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); } + pub(crate) fn get(&self, slot: usize) -> u32 { self.builder.get(slot) } + + pub(crate) fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() } + pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() } + + pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) { + self.builder.inc_present(col); } -} -impl IntSliceMut for TempCompactIntVecBuilder { - fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); } - fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() } - fn clear_overflow(&mut self) { self.builder.clear_overflow(); } + pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) { + self.builder.inc_present_fast(col); + } + + pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + self.builder.inc_predicate(col, pred); + } + + pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + self.builder.inc_predicate_fast(col, pred); + } + + pub(crate) fn add(&mut self, other: IntSliceView<'_>) { + self.builder.add(other); + } + + pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) { + self.builder.mask_with(mask); + } + + pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); } + pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); } + pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); } } diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs index 5d93222..7600ac3 100644 --- a/src/obicompactvec/src/tests/bitmatrix.rs +++ b/src/obicompactvec/src/tests/bitmatrix.rs @@ -1,7 +1,7 @@ use tempfile::tempdir; use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder}; -use crate::traits::{BitPartials, BitSlice, BitSliceMut}; +use crate::traits::BitPartials; fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { let n = cols.first().map_or(0, |c| c.len()); diff --git a/src/obicompactvec/src/tests/bitvec.rs b/src/obicompactvec/src/tests/bitvec.rs index 7382e14..4669489 100644 --- a/src/obicompactvec/src/tests/bitvec.rs +++ b/src/obicompactvec/src/tests/bitvec.rs @@ -1,6 +1,5 @@ use tempfile::tempdir; -use crate::traits::{BitSlice, BitSliceMut}; use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder}; fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) { @@ -78,7 +77,7 @@ fn op_and() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pbiv"); let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); - b.and(&rb); + b.and(rb.view()); b.close().unwrap(); let r = PersistentBitVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![true, false, false, false]); @@ -91,7 +90,7 @@ fn op_or() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pbiv"); let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); - b.or(&rb); + b.or(rb.view()); b.close().unwrap(); let r = PersistentBitVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![true, true, true, false]); @@ -104,7 +103,7 @@ fn op_xor() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pbiv"); let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); - b.xor(&rb); + b.xor(rb.view()); b.close().unwrap(); let r = PersistentBitVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![false, true, true, false]); diff --git a/src/obicompactvec/src/tests/colgroup.rs b/src/obicompactvec/src/tests/colgroup.rs index 388508d..884450f 100644 --- a/src/obicompactvec/src/tests/colgroup.rs +++ b/src/obicompactvec/src/tests/colgroup.rs @@ -5,8 +5,7 @@ use crate::{ PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, }; -use crate::traits::{BitSlice, BitSliceMut, IntSlice, IntSliceMut}; -use crate::{MemoryBitVec, MemoryIntVec}; +use crate::{PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder}; // ── helpers ─────────────────────────────────────────────────────────────────── @@ -114,42 +113,52 @@ fn int_partial_group_any() { #[test] fn mask_with_zeros_selected_slots() { // count vec [10, 20, 30, 40], mask [T, F, T, F] → [10, 0, 30, 0] - let mut v = MemoryIntVec::new(4); + let dir = tempdir().unwrap(); + let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap(); v.set(0, 10); v.set(1, 20); v.set(2, 30); v.set(3, 40); - let mut mask = MemoryBitVec::new(4); + let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap(); mask.set(0, true); mask.set(2, true); - v.mask_with(&mask); - assert_eq!(v.get(0), 10); - assert_eq!(v.get(1), 0); - assert_eq!(v.get(2), 30); - assert_eq!(v.get(3), 0); + v.mask_with(mask.view()); + v.close().unwrap(); + let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap(); + assert_eq!(r.get(0), 10); + assert_eq!(r.get(1), 0); + assert_eq!(r.get(2), 30); + assert_eq!(r.get(3), 0); } #[test] fn mask_with_overflow_slot_zeroed() { // overflow slot (value 500) masked out → removed from overflow, primary=0 - let mut v = MemoryIntVec::new(3); + let dir = tempdir().unwrap(); + let mut v = PersistentCompactIntVecBuilder::new(3, &dir.path().join("v.pciv")).unwrap(); v.set(0, 10); v.set(1, 500); v.set(2, 5); - let mut mask = MemoryBitVec::new(3); + let mut mask = PersistentBitVecBuilder::new(3, &dir.path().join("m.pbiv")).unwrap(); mask.set(0, true); mask.set(2, true); // slot 1 masked out - v.mask_with(&mask); - assert_eq!(v.get(0), 10); - assert_eq!(v.get(1), 0); - assert_eq!(v.get(2), 5); - let ov: Vec<_> = v.overflow_entries().collect(); + v.mask_with(mask.view()); + v.close().unwrap(); + let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap(); + assert_eq!(r.get(0), 10); + assert_eq!(r.get(1), 0); + assert_eq!(r.get(2), 5); + let ov: Vec<_> = r.view().overflow_entries().collect(); assert!(ov.is_empty(), "overflow entry for masked-out slot should be gone"); } #[test] fn mask_with_all_ones_is_noop() { - let mut v = MemoryIntVec::new(4); + let dir = tempdir().unwrap(); + let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap(); v.set(0, 300); v.set(1, 1); v.set(2, 0); v.set(3, 42); - let mask = MemoryBitVec::ones(4); - v.mask_with(&mask); - assert_eq!(v.get(0), 300); - assert_eq!(v.get(1), 1); - assert_eq!(v.get(2), 0); - assert_eq!(v.get(3), 42); + let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap(); + mask.not(); // all bits → 1 + v.mask_with(mask.view()); + v.close().unwrap(); + let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap(); + assert_eq!(r.get(0), 300); + assert_eq!(r.get(1), 1); + assert_eq!(r.get(2), 0); + assert_eq!(r.get(3), 42); } // ── BitMatrix: partial_group_presence_count ─────────────────────────────────── diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs index d9869aa..9abd7b5 100644 --- a/src/obicompactvec/src/tests/intmatrix.rs +++ b/src/obicompactvec/src/tests/intmatrix.rs @@ -1,7 +1,7 @@ use tempfile::tempdir; use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; -use crate::traits::{CountPartials, IntSlice}; +use crate::traits::CountPartials; fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { let n = cols.first().map_or(0, |c| c.len()); @@ -290,7 +290,7 @@ fn col_view_packed_matches_columnar() { } assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum"); let mut ov_view: Vec<(usize, u32)> = col_view.overflow_entries().collect(); - let mut ov_ref: Vec<(usize, u32)> = col_ref.overflow_entries().collect(); + let mut ov_ref: Vec<(usize, u32)> = col_ref.view().overflow_entries().collect(); ov_view.sort_unstable_by_key(|&(s, _)| s); ov_ref.sort_unstable_by_key(|&(s, _)| s); assert_eq!(ov_view, ov_ref, "col={c} overflow_entries"); diff --git a/src/obicompactvec/src/tests/memoryvec.rs b/src/obicompactvec/src/tests/memoryvec.rs deleted file mode 100644 index 3fd4afb..0000000 --- a/src/obicompactvec/src/tests/memoryvec.rs +++ /dev/null @@ -1,484 +0,0 @@ -use tempfile::tempdir; - -use crate::traits::{BitSlice, BitSliceMut, BitToInt, IntSlice, IntSliceMut, IntToBit}; -use crate::{MemoryBitVec, MemoryIntVec, PersistentBitVec, PersistentBitVecBuilder}; - -// ── MemoryBitVec ────────────────────────────────────────────────────────────── - -#[test] -fn mbv_new_all_zero() { - let v = MemoryBitVec::new(10); - assert_eq!(v.len(), 10); - assert!(!(0..10).any(|s| v.get(s))); - assert_eq!(v.count_ones(), 0); -} - -#[test] -fn mbv_ones_all_set() { - let v = MemoryBitVec::ones(10); - assert!((0..10).all(|s| v.get(s))); - assert_eq!(v.count_ones(), 10); - assert_eq!(v.count_zeros(), 0); -} - -#[test] -fn mbv_ones_no_padding_leak() { - // 5 bits: padding bits in last word must stay 0 - let v = MemoryBitVec::ones(5); - assert_eq!(v.words()[0], 0b11111); -} - -#[test] -fn mbv_set_get_roundtrip() { - let mut v = MemoryBitVec::new(64); - v.set(0, true); - v.set(63, true); - assert!(v.get(0)); - assert!(!v.get(1)); - assert!(v.get(63)); - assert_eq!(v.count_ones(), 2); -} - -#[test] -fn mbv_and() { - let mut a = MemoryBitVec::new(4); - a.set(0, true); a.set(1, true); - let mut b = MemoryBitVec::new(4); - b.set(0, true); b.set(2, true); - a.and(&b); - assert!(a.get(0)); assert!(!a.get(1)); assert!(!a.get(2)); -} - -#[test] -fn mbv_or() { - let mut a = MemoryBitVec::new(4); - a.set(0, true); a.set(1, true); - let mut b = MemoryBitVec::new(4); - b.set(0, true); b.set(2, true); - a.or(&b); - assert!(a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3)); -} - -#[test] -fn mbv_xor() { - let mut a = MemoryBitVec::new(4); - a.set(0, true); a.set(1, true); - let mut b = MemoryBitVec::new(4); - b.set(0, true); b.set(2, true); - a.xor(&b); - assert!(!a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3)); -} - -#[test] -fn mbv_not() { - let mut a = MemoryBitVec::new(4); - a.set(0, true); a.set(2, true); - a.not(); - assert!(!a.get(0)); assert!(a.get(1)); assert!(!a.get(2)); assert!(a.get(3)); -} - -#[test] -fn mbv_not_no_padding_leak() { - let mut v = MemoryBitVec::new(5); - v.not(); - assert_eq!(v.count_ones(), 5); - assert_eq!(v.words()[0], 0b11111); -} - -#[test] -fn mbv_ops_chaining() { - let mut a = MemoryBitVec::ones(8); - let b = MemoryBitVec::new(8); // all zeros - a.and(&b).or(&b).not(); - assert_eq!(a.count_ones(), 8); -} - -#[test] -fn mbv_std_ops_owned() { - let mut a = MemoryBitVec::new(4); - a.set(0, true); a.set(1, true); - let mut b = MemoryBitVec::new(4); - b.set(1, true); b.set(2, true); - let c = a & &b; - assert!(!c.get(0)); assert!(c.get(1)); assert!(!c.get(2)); -} - -#[test] -fn mbv_std_ops_assign() { - let mut a = MemoryBitVec::new(4); - a.set(0, true); a.set(1, true); - let mut b = MemoryBitVec::new(4); - b.set(1, true); b.set(2, true); - a &= &b; - assert!(!a.get(0)); assert!(a.get(1)); -} - -#[test] -fn mbv_from_persistent() { - let dir = tempdir().unwrap(); - let path = dir.path().join("v.pbiv"); - let mut builder = PersistentBitVecBuilder::new(4, &path).unwrap(); - builder.set(1, true); builder.set(3, true); - builder.close().unwrap(); - let pv = PersistentBitVec::open(&path).unwrap(); - let mv = MemoryBitVec::from(&pv); - assert!(!mv.get(0)); assert!(mv.get(1)); assert!(!mv.get(2)); assert!(mv.get(3)); -} - -#[test] -fn mbv_persist_roundtrip() { - let dir = tempdir().unwrap(); - let path = dir.path().join("out.pbiv"); - let mut v = MemoryBitVec::new(8); - v.set(2, true); v.set(5, true); - let builder = v.persist(&path).unwrap(); - builder.close().unwrap(); - let pv = PersistentBitVec::open(&path).unwrap(); - assert!(pv.get(2)); assert!(pv.get(5)); - assert_eq!(pv.count_ones(), 2); -} - -// ── MemoryIntVec ────────────────────────────────────────────────────────────── - -#[test] -fn miv_new_all_zero() { - let v = MemoryIntVec::new(10); - assert_eq!(v.len(), 10); - assert!((0..10).all(|s| v.get(s) == 0)); -} - -#[test] -fn miv_set_get_roundtrip() { - let mut v = MemoryIntVec::new(4); - v.set(0, 42); v.set(3, 200); - assert_eq!(v.get(0), 42); - assert_eq!(v.get(1), 0); - assert_eq!(v.get(3), 200); -} - -#[test] -fn miv_overflow_roundtrip() { - let mut v = MemoryIntVec::new(4); - v.set(1, 1000); - assert_eq!(v.get(1), 1000); - assert_eq!(v.get(0), 0); -} - -#[test] -fn miv_inc_dec() { - let mut v = MemoryIntVec::new(4); - v.inc(2); v.inc(2); v.inc(2); - assert_eq!(v.get(2), 3); - v.dec(2); - assert_eq!(v.get(2), 2); -} - -#[test] -fn miv_dec_saturates_at_zero() { - let mut v = MemoryIntVec::new(4); - v.dec(0); - assert_eq!(v.get(0), 0); -} - -#[test] -fn miv_add_at() { - let mut v = MemoryIntVec::new(4); - v.add_at(1, 100); v.add_at(1, 200); - assert_eq!(v.get(1), 300); -} - -#[test] -fn miv_min_max() { - let mut a = MemoryIntVec::new(4); - a.set(0, 5); a.set(1, 2); a.set(2, 8); - let mut b = MemoryIntVec::new(4); - b.set(0, 3); b.set(1, 7); b.set(2, 8); - let mut c = MemoryIntVec::from(&a); - IntSliceMut::min(&mut c, &b); - assert_eq!(c.get(0), 3); assert_eq!(c.get(1), 2); assert_eq!(c.get(2), 8); - let mut d = MemoryIntVec::from(&a); - IntSliceMut::max(&mut d, &b); - assert_eq!(d.get(0), 5); assert_eq!(d.get(1), 7); assert_eq!(d.get(2), 8); -} - -#[test] -fn miv_add_diff() { - let mut a = MemoryIntVec::new(3); - a.set(0, 10); a.set(1, 5); - let mut b = MemoryIntVec::new(3); - b.set(0, 3); b.set(1, 8); - let mut c = MemoryIntVec::from(&a); - c.add(&b); - assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13); - let mut d = MemoryIntVec::from(&a); - d.diff(&b); - assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); // saturating sub -} - -#[test] -fn miv_std_ops() { - let mut a = MemoryIntVec::new(3); - a.set(0, 10); a.set(1, 5); - let mut b = MemoryIntVec::new(3); - b.set(0, 3); b.set(1, 8); - let c = &a + &b; - assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13); - let d = &a - &b; - assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); -} - -#[test] -fn miv_from_persistent() { - use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder}; - let dir = tempdir().unwrap(); - let path = dir.path().join("v.pciv"); - let mut b = PersistentCompactIntVecBuilder::new(4, &path).unwrap(); - b.set(1, 42); b.set(3, 1000); - b.close().unwrap(); - let pv = PersistentCompactIntVec::open(&path).unwrap(); - let mv = MemoryIntVec::from(&pv); - assert_eq!(mv.get(0), 0); assert_eq!(mv.get(1), 42); assert_eq!(mv.get(3), 1000); -} - -// ── Cross-type conversions ──────────────────────────────────────────────────── - -#[test] -fn to_bitvec_threshold() { - let mut v = MemoryIntVec::new(5); - v.set(0, 0); v.set(1, 1); v.set(2, 5); v.set(3, 10); v.set(4, 3); - let bv = v.to_bitvec(4); // > 4: slots 2 (5) and 3 (10) pass - assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2)); - assert!(bv.get(3)); assert!(!bv.get(4)); -} - -#[test] -fn to_presence() { - let mut v = MemoryIntVec::new(4); - v.set(1, 1); v.set(3, 100); - let bv = v.to_presence(); - assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3)); -} - -#[test] -fn to_intvec_roundtrip() { - let mut bv = MemoryBitVec::new(8); - bv.set(0, true); bv.set(3, true); bv.set(7, true); - let iv = bv.to_intvec(); - assert_eq!(iv.get(0), 1); assert_eq!(iv.get(1), 0); - assert_eq!(iv.get(3), 1); assert_eq!(iv.get(7), 1); -} - -#[test] -fn to_intvec_word_boundary() { - // 65 bits: spans two words - let mut bv = MemoryBitVec::new(65); - bv.set(63, true); bv.set(64, true); - let iv = bv.to_intvec(); - assert_eq!(iv.get(63), 1); assert_eq!(iv.get(64), 1); assert_eq!(iv.get(62), 0); -} - -#[test] -fn count_bits_accumulates() { - let mut count = MemoryIntVec::new(8); - let mut b1 = MemoryBitVec::new(8); - b1.set(0, true); b1.set(2, true); - let mut b2 = MemoryBitVec::new(8); - b2.set(0, true); b2.set(3, true); - let mut b3 = MemoryBitVec::new(8); - b3.set(2, true); b3.set(3, true); - count.count_bits(&b1).count_bits(&b2).count_bits(&b3); - assert_eq!(count.get(0), 2); - assert_eq!(count.get(2), 2); - assert_eq!(count.get(3), 2); - assert_eq!(count.get(1), 0); -} - -#[test] -fn count_bits_skips_zero_words() { - // Entire first word is zero — should not touch those slots - let mut count = MemoryIntVec::new(128); - let mut bv = MemoryBitVec::new(128); - bv.set(64, true); bv.set(127, true); - count.count_bits(&bv); - assert_eq!(count.get(0), 0); - assert_eq!(count.get(64), 1); - assert_eq!(count.get(127), 1); -} - -// ── min / max / add / diff — overflow edge cases ────────────────────────────── - -#[test] -fn miv_min_overflow_edges() { - // [300, 50, 400, 300] min [50, 300, 500, 200] - // slot 0: self=overflow(300), other=primary(50) → 50 (overflow removed) - // slot 1: self=primary(50), other=overflow(300) → 50 (no overflow created) - // slot 2: self=overflow(400), other=overflow(500) → 400 (overflow updated) - // slot 3: self=overflow(300), other=primary(200) → 200 (overflow removed, 200 < 255) - let mut a = MemoryIntVec::new(4); - a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 300); - let mut b = MemoryIntVec::new(4); - b.set(0, 50); b.set(1, 300); b.set(2, 500); b.set(3, 200); - IntSliceMut::min(&mut a, &b); - assert_eq!(a.get(0), 50); - assert_eq!(a.get(1), 50); - assert_eq!(a.get(2), 400); - assert_eq!(a.get(3), 200); - // Only slot 2 should still have an overflow entry. - let ov: std::collections::HashMap = a.overflow_entries().collect(); - assert_eq!(ov.len(), 1); - assert_eq!(ov[&2], 400); -} - -#[test] -fn miv_max_overflow_edges() { - // [50, 300, 100, 400] max [300, 50, 500, 200] - // slot 0: self=primary(50), other=overflow(300) → 300 (overflow created) - // slot 1: self=overflow(300), other=primary(50) → 300 (overflow unchanged) - // slot 2: self=primary(100), other=overflow(500) → 500 (overflow created) - // slot 3: self=overflow(400), other=overflow(200) → 400 (overflow unchanged, 200 < 255 wait...) - // Wait — 200 < 255 so other slot 3 is NOT overflow. Correct: max(400, 200) = 400. - let mut a = MemoryIntVec::new(4); - a.set(0, 50); a.set(1, 300); a.set(2, 100); a.set(3, 400); - let mut b = MemoryIntVec::new(4); - b.set(0, 300); b.set(1, 50); b.set(2, 500); b.set(3, 200); - IntSliceMut::max(&mut a, &b); - assert_eq!(a.get(0), 300); - assert_eq!(a.get(1), 300); - assert_eq!(a.get(2), 500); - assert_eq!(a.get(3), 400); - let ov: std::collections::HashMap = a.overflow_entries().collect(); - assert_eq!(ov.len(), 4); // all four results >= 255 - assert_eq!(ov[&0], 300); - assert_eq!(ov[&1], 300); - assert_eq!(ov[&2], 500); - assert_eq!(ov[&3], 400); -} - -#[test] -fn miv_add_overflow_edges() { - // [300, 50, 400, 200] + [50, 300, 200, 200] - // slot 0: self=overflow(300), other=primary(50) → 350 (overflow updated) - // slot 1: self=primary(50), other=overflow(300) → 350 (overflow created from primary) - // slot 2: self=overflow(400), other=overflow(200... wait 200 < 255) - // other slot 2 is primary(200); 400+200=600 (overflow updated) - // slot 3: self=primary(200), other=primary(200) → 400 (overflow created, 400 >= 255) - let mut a = MemoryIntVec::new(4); - a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 200); - let mut b = MemoryIntVec::new(4); - b.set(0, 50); b.set(1, 300); b.set(2, 200); b.set(3, 200); - a.add(&b); - assert_eq!(a.get(0), 350); - assert_eq!(a.get(1), 350); - assert_eq!(a.get(2), 600); - assert_eq!(a.get(3), 400); - let ov: std::collections::HashMap = a.overflow_entries().collect(); - assert_eq!(ov.len(), 4); -} - -#[test] -fn miv_add_both_overflow() { - // [300] + [400] = [700] - let mut a = MemoryIntVec::new(1); - a.set(0, 300); - let mut b = MemoryIntVec::new(1); - b.set(0, 400); - a.add(&b); - assert_eq!(a.get(0), 700); - let ov: std::collections::HashMap = a.overflow_entries().collect(); - assert_eq!(ov[&0], 700); -} - -#[test] -fn miv_diff_overflow_edges() { - // [300, 400, 400, 50] - [100, 50, 350, 300] - // slot 0: self=overflow(300), other=primary(100) → 200 (overflow removed, 200 < 255) - // slot 1: self=overflow(400), other=primary(50) → 350 (overflow updated, 350 >= 255) - // slot 2: self=overflow(400), other=overflow(350) → 50 (overflow removed, 50 < 255) - // slot 3: self=primary(50), other=overflow(300) → 0 (saturating, stays primary) - let mut a = MemoryIntVec::new(4); - a.set(0, 300); a.set(1, 400); a.set(2, 400); a.set(3, 50); - let mut b = MemoryIntVec::new(4); - b.set(0, 100); b.set(1, 50); b.set(2, 350); b.set(3, 300); - a.diff(&b); - assert_eq!(a.get(0), 200); - assert_eq!(a.get(1), 350); - assert_eq!(a.get(2), 50); - assert_eq!(a.get(3), 0); - let ov: std::collections::HashMap = a.overflow_entries().collect(); - assert_eq!(ov.len(), 1); // only slot 1 remains overflow - assert_eq!(ov[&1], 350); -} - -// ── Comparison operators ────────────────────────────────────────────────────── - -#[test] -fn cmp_gt() { - let mut v = MemoryIntVec::new(5); - v.set(0, 0); v.set(1, 3); v.set(2, 5); v.set(3, 3); v.set(4, 10); - let bv = v.gt(3); - assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2)); - assert!(!bv.get(3)); assert!(bv.get(4)); -} - -#[test] -fn cmp_geq() { - let mut v = MemoryIntVec::new(4); - v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 1); - let bv = v.geq(3); - assert!(!bv.get(0)); assert!(bv.get(1)); assert!(bv.get(2)); assert!(!bv.get(3)); -} - -#[test] -fn cmp_lt() { - let mut v = MemoryIntVec::new(4); - v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 0); - let bv = v.lt(3); - assert!(bv.get(0)); assert!(!bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3)); -} - -#[test] -fn cmp_leq() { - let mut v = MemoryIntVec::new(4); - v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 3); - let bv = v.leq(3); - assert!(bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3)); -} - -#[test] -fn cmp_scalar_with_overflow() { - // Slots: [10, 1000, 50, 500, 0] - // geq(100): slots 1 (1000) and 3 (500) → both overflow, must qualify - // lt(500): slots 0 (10), 2 (50), 4 (0) → primary; slot 1 (1000) → no; slot 3 (500) → no - // geq(2000): only slot 1 (1000) fails, no slot qualifies - let mut v = MemoryIntVec::new(5); - v.set(0, 10); v.set(1, 1000); v.set(2, 50); v.set(3, 500); v.set(4, 0); - - let bv = v.geq(100); - assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); - assert!(bv.get(3)); assert!(!bv.get(4)); - - let bv = v.lt(500); - assert!(bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2)); - assert!(!bv.get(3)); assert!(bv.get(4)); - - let bv = v.geq(2000); - assert!(!(0..5).any(|s| bv.get(s))); -} - -#[test] -fn filter_pattern() { - // Typical filter: ingroup >= min_count AND outgroup <= max_outgroup - let mut ingroup = MemoryIntVec::new(6); - let mut outgroup = MemoryIntVec::new(6); - // slot 2: ingroup=3, outgroup=0 → keep - // slot 4: ingroup=2, outgroup=1 → drop (outgroup > 0) - // slot 5: ingroup=1, outgroup=0 → drop (ingroup < 2) - ingroup.set(2, 3); ingroup.set(4, 2); ingroup.set(5, 1); - outgroup.set(4, 1); - let out_mask = outgroup.leq(0); - let mut in_mask = ingroup.geq(2); - let keep = in_mask.and(&out_mask); - assert!(!keep.get(0)); assert!(!keep.get(1)); - assert!(keep.get(2)); - assert!(!keep.get(4)); assert!(!keep.get(5)); -} diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs index 3a61ab3..31f630e 100644 --- a/src/obicompactvec/src/tests/mod.rs +++ b/src/obicompactvec/src/tests/mod.rs @@ -2,12 +2,9 @@ mod bitmatrix; mod bitvec; mod colgroup; mod intmatrix; -mod memoryvec; use tempfile::tempdir; -use crate::traits::IntSliceMut; - use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder}; fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec { @@ -173,7 +170,7 @@ fn combine_min() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.min(&rb); + b.min(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![10, 100, 0, 800]); @@ -186,7 +183,7 @@ fn combine_max() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.max(&rb); + b.max(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![20, 300, 500, 1000]); @@ -199,7 +196,7 @@ fn combine_add() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.add(&rb); + b.add(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![30, 300, 5, 101]); @@ -224,7 +221,7 @@ fn combine_diff() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.diff(&rb); + b.diff(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![10, 700, 0, 0]); diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index 9a647ec..cc52bc1 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -1,353 +1,5 @@ -use std::collections::HashMap; - use ndarray::{Array1, Array2}; -// ── BitSlice / BitSliceMut ──────────────────────────────────────────────────── - -/// Read-only view over the u64 word array of a bit vector. -/// -/// Bit `i` is in `words()[i >> 6]` at position `i & 63`. -/// Padding bits in the last word are zero. -pub trait BitSlice { - fn len(&self) -> usize; - fn words(&self) -> &[u64]; - fn is_empty(&self) -> bool { self.len() == 0 } - fn get(&self, slot: usize) -> bool { - (self.words()[slot >> 6] >> (slot & 63)) & 1 != 0 - } - fn count_ones(&self) -> u64 { - self.words().iter().map(|w| w.count_ones() as u64).sum() - } - fn count_zeros(&self) -> u64 { self.len() as u64 - self.count_ones() } - fn partial_jaccard_dist(&self, other: &S) -> (u64, u64) { - assert_eq!(self.len(), other.len(), "length mismatch"); - self.words().iter().zip(other.words()) - .fold((0u64, 0u64), |(i, u), (&a, &b)| { - (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64) - }) - } - fn jaccard_dist(&self, other: &S) -> f64 { - let (inter, union) = self.partial_jaccard_dist(other); - if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } - } - fn hamming_dist(&self, other: &S) -> u64 { - assert_eq!(self.len(), other.len(), "length mismatch"); - self.words().iter().zip(other.words()) - .map(|(&a, &b)| (a ^ b).count_ones() as u64) - .sum() - } -} - -/// Mutable view over a bit-vector word array; default methods maintain the -/// zero-padding invariant on the last word. -pub trait BitSliceMut: BitSlice { - fn words_mut(&mut self) -> &mut [u64]; - - fn set(&mut self, slot: usize, value: bool) { - let bit = 1u64 << (slot & 63); - if value { self.words_mut()[slot >> 6] |= bit; } else { self.words_mut()[slot >> 6] &= !bit; } - } - - fn copy_from(&mut self, src: &S) -> &mut Self { - assert_eq!(self.len(), src.len(), "BitSlice length mismatch"); - self.words_mut().copy_from_slice(src.words()); - self - } - - fn and(&mut self, other: &S) -> &mut Self { - assert_eq!(self.len(), other.len(), "BitSlice length mismatch"); - for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w &= o; } - self - } - - fn or(&mut self, other: &S) -> &mut Self { - assert_eq!(self.len(), other.len(), "BitSlice length mismatch"); - for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w |= o; } - self - } - - fn xor(&mut self, other: &S) -> &mut Self { - assert_eq!(self.len(), other.len(), "BitSlice length mismatch"); - for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w ^= o; } - self - } - - fn not(&mut self) -> &mut Self { - let rem = self.len() % 64; - let words = self.words_mut(); - for w in words.iter_mut() { *w ^= u64::MAX; } - if rem != 0 { - if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; } - } - self - } -} - -// ── IntSlice / IntSliceMut ──────────────────────────────────────────────────── - -/// Read-only access to a compact integer vector (values encoded as u32). -pub trait IntSlice { - fn len(&self) -> usize; - fn get(&self, slot: usize) -> u32; - /// Raw primary byte slice (sentinel 255 marks overflow slots). - fn primary_bytes(&self) -> &[u8]; - /// Iterator over `(slot, true_value)` pairs for all overflow entries (value >= 255). - fn overflow_entries(&self) -> impl Iterator + '_; - fn is_empty(&self) -> bool { self.len() == 0 } - fn iter(&self) -> impl Iterator + '_ { (0..self.len()).map(|i| self.get(i)) } - fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() } - fn count_nonzero(&self) -> u64 { self.iter().filter(|v| *v > 0).count() as u64 } - - fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v < threshold) } - fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) } - fn gt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v > threshold) } - fn geq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v >= threshold) } - - fn cmp_scalar(&self, pred: impl Fn(u32) -> bool) -> MemoryBitVec { - let n = self.len(); - let mut words = vec![0u64; n.div_ceil(64)]; - let primary = self.primary_bytes(); - // Pass 1: byte scan — no HashMap access, vectorisable for simple predicates. - // Overflow slots (b == 255) are left as 0 and fixed in pass 2. - for s in 0..n { - let b = primary[s]; - if b < 255 && pred(b as u32) { - words[s >> 6] |= 1u64 << (s & 63); - } - } - // Pass 2: fix up overflow slots — O(k), negligible. - for (s, val) in self.overflow_entries() { - if pred(val) { words[s >> 6] |= 1u64 << (s & 63); } - } - MemoryBitVec::from_words(words, n) - } -} - -/// Mutable access; default methods use only `get` / `set` and maintain the -/// compact encoding invariants on the implementor's side. -pub trait IntSliceMut: IntSlice { - fn set(&mut self, slot: usize, value: u32); - fn primary_bytes_mut(&mut self) -> &mut [u8]; - fn clear_overflow(&mut self); - - fn inc(&mut self, slot: usize) -> &mut Self { - let v = self.get(slot); - self.set(slot, v.saturating_add(1)); - self - } - - fn dec(&mut self, slot: usize) -> &mut Self { - let v = self.get(slot); - self.set(slot, v.saturating_sub(1)); - self - } - - fn add_at(&mut self, slot: usize, delta: u32) -> &mut Self { - let v = self.get(slot); - self.set(slot, v.saturating_add(delta)); - self - } - - fn copy_from(&mut self, src: &S) -> &mut Self { - assert_eq!(self.len(), src.len(), "IntSlice length mismatch"); - self.primary_bytes_mut().copy_from_slice(src.primary_bytes()); - self.clear_overflow(); - for (slot, val) in src.overflow_entries() { self.set(slot, val); } - self - } - - fn min(&mut self, other: &S) -> &mut Self { - assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); - // Snapshot both overflow sets (O(k), tiny) before mutating self. - // 255 = +∞ on u8, so byte-level min is correct in all cases except - // both-overflow: only those slots need a fixup pass. - let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect(); - let other_ov: HashMap = other.overflow_entries().collect(); - self.clear_overflow(); - // Pass 1 — SIMD-vectorizable byte min over the full primary array. - for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) { - if b < *a { *a = b; } - } - // Pass 2 — fixup slots where BOTH sides were overflow (primary = 255 after pass 1, - // but the overflow value may have changed). Slots where only self was overflow are - // already correct: pass 1 wrote other.primary[slot] < 255 and clear_overflow removed - // the stale entry. - for (slot, self_val) in self_ov { - if let Some(&other_val) = other_ov.get(&slot) { - self.set(slot, self_val.min(other_val)); - } - } - self - } - - fn max(&mut self, other: &S) -> &mut Self { - assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); - // Pre-pass — process other's overflow entries BEFORE the byte pass. - // After the byte pass, self.primary[slot] = 255 for all slots in other_ov, - // making it impossible to recover the original self value; we need it now. - for (slot, other_val) in other.overflow_entries() { - let self_val = self.get(slot); - self.set(slot, self_val.max(other_val)); - } - // Pass 1 — SIMD-vectorizable byte max over the full primary array. - // 255 = +∞ on u8 → max(a, 255) = 255 is the correct sentinel for all - // overflow slots, whether handled by the pre-pass or already in self. - for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) { - if b > *a { *a = b; } - } - self - } - - fn add(&mut self, other: &S) -> &mut Self { - assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); - let n = self.len(); - for s in 0..n { - // Read both primary bytes first — u8 is Copy, borrows released immediately. - let sb = self.primary_bytes()[s]; - let ob = other.primary_bytes()[s]; - if sb < 255 && ob < 255 { - // Hot path: no overflow lookup, no HashMap write in the common case. - let sum = sb as u32 + ob as u32; - if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; } - else { self.set(s, sum); } - } else { - // At least one side is in overflow — get() is unavoidable. - let self_val = self.get(s); - let other_val = other.get(s); - self.set(s, self_val + other_val); - } - } - self - } - - fn diff(&mut self, other: &S) -> &mut Self { - assert_eq!(self.len(), other.len(), "IntSlice length mismatch"); - let n = self.len(); - for s in 0..n { - let sb = self.primary_bytes()[s]; - let ob = other.primary_bytes()[s]; - if sb < 255 { - // Result is always < 255 — no overflow created or consulted. - // ob == 255 means b ≥ 255 > a, so saturating result = 0. - self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 }; - } else { - // sb == 255: self has overflow — get() unavoidable. - // other.get() only needed when ob == 255 too (both-overflow case). - let self_val = self.get(s); - let other_val = if ob < 255 { ob as u32 } else { other.get(s) }; - self.set(s, self_val.saturating_sub(other_val)); - } - } - self - } - - /// For each slot where `bits` is true, increment `self` by 1. - /// Skips zero words entirely — O(n_ones) rather than O(n). - fn count_bits(&mut self, bits: &B) -> &mut Self { - assert_eq!(self.len(), bits.len(), "IntSlice/BitSlice length mismatch"); - for (w_idx, &word) in bits.words().iter().enumerate() { - if word == 0 { continue; } - let base = w_idx * 64; - let mut w = word; - while w != 0 { - let bit = w.trailing_zeros() as usize; - let slot = base + bit; - if slot < self.len() { self.inc(slot); } - w &= w - 1; - } - } - self - } - - /// Zero every slot where the corresponding bit in `mask` is 0. - /// Iterates only the zero bits — O(n_zeros), O(1) when mask is all-ones. - fn mask_with(&mut self, mask: &B) -> &mut Self { - assert_eq!(self.len(), mask.len(), "IntSlice/BitSlice length mismatch"); - let n = self.len(); - for (wi, &word) in mask.words().iter().enumerate() { - if word == u64::MAX { continue; } - let mut zeros = !word; - while zeros != 0 { - let bit = zeros.trailing_zeros() as usize; - let s = wi * 64 + bit; - if s < n { - // u8 is Copy — the immutable borrow from primary_bytes() ends - // before the mutable borrow from set() begins. - let b = self.primary_bytes()[s]; - if b != 0 { self.set(s, 0); } - } - zeros &= zeros - 1; - } - } - self - } -} - -// ── IntSlice → MemoryBitVec conversions ─────────────────────────────────────── - -use crate::memoryvec::MemoryBitVec; - -pub trait IntToBit: IntSlice { - /// Bit set iff value >= threshold. Consistent with `geq` and `build_from_counts`. - fn to_bitvec(&self, threshold: u32) -> MemoryBitVec { self.geq(threshold) } - - /// Bit set iff value >= 1 (slot is present). - fn to_presence(&self) -> MemoryBitVec { self.geq(1) } -} - -impl IntToBit for T {} - -// ── BitSlice → MemoryIntVec conversion ─────────────────────────────────────── - -use crate::memoryintvec::MemoryIntVec; - -// Maps each byte value to its 8 constituent bits as individual u8 (0 or 1). -static EXPAND_BYTE: [[u8; 8]; 256] = { - let mut table = [[0u8; 8]; 256]; - let mut b = 0usize; - while b < 256 { - let mut bit = 0usize; - while bit < 8 { - table[b][bit] = ((b >> bit) & 1) as u8; - bit += 1; - } - b += 1; - } - table -}; - -pub trait BitToInt: BitSlice { - fn to_intvec(&self) -> MemoryIntVec { - let n = self.len(); - let mut primary = vec![0u8; n]; - - let words = self.words(); - let full_words = n / 64; - - for (w_idx, &word) in words[..full_words].iter().enumerate() { - let base = w_idx * 64; - for byte_off in 0..8usize { - let byte = (word >> (byte_off * 8)) as u8; - primary[base + byte_off * 8..base + byte_off * 8 + 8] - .copy_from_slice(&EXPAND_BYTE[byte as usize]); - } - } - - let rem = n % 64; - if rem > 0 { - let word = words[full_words]; - let base = full_words * 64; - for bit in 0..rem { - primary[base + bit] = ((word >> bit) & 1) as u8; - } - } - - MemoryIntVec::from_primary(primary) - } -} - -impl BitToInt for T {} - // ── Column-level weight statistic — total count or presence count per column. /// Additive across layers and partitions; used as denominator in normalised distances. /// diff --git a/src/obicompactvec/src/views.rs b/src/obicompactvec/src/views.rs new file mode 100644 index 0000000..85e4165 --- /dev/null +++ b/src/obicompactvec/src/views.rs @@ -0,0 +1,278 @@ +use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry}; + +// ── BitSliceView ────────────────────────────────────────────────────────────── + +/// Lightweight, copy-able read-only view over a u64 word array. +/// Bit `i` is in `words[i >> 6]` at position `i & 63`. Padding bits are zero. +#[derive(Clone, Copy)] +pub struct BitSliceView<'a> { + pub(crate) words: &'a [u64], + pub(crate) n: usize, +} + +impl<'a> BitSliceView<'a> { + #[inline] + pub fn new(words: &'a [u64], n: usize) -> Self { Self { words, n } } + + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + pub fn words(&self) -> &'a [u64] { self.words } + + #[inline] + pub fn get(&self, slot: usize) -> bool { + (self.words[slot >> 6] >> (slot & 63)) & 1 != 0 + } + + pub fn count_ones(&self) -> u64 { + self.words.iter().map(|w| w.count_ones() as u64).sum() + } + pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() } + + pub fn iter(&self) -> BitSliceIter<'a> { + BitSliceIter { words: self.words, slot: 0, n: self.n } + } + + pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) { + assert_eq!(self.n, other.n, "BitSliceView length mismatch"); + self.words.iter().zip(other.words) + .fold((0u64, 0u64), |(i, u), (&a, &b)| { + (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64) + }) + } + + pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 { + let (inter, union) = self.partial_jaccard_dist(other); + if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } + } + + pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 { + assert_eq!(self.n, other.n, "BitSliceView length mismatch"); + self.words.iter().zip(other.words) + .map(|(&a, &b)| (a ^ b).count_ones() as u64) + .sum() + } +} + +// ── BitSliceIter ────────────────────────────────────────────────────────────── + +pub struct BitSliceIter<'a> { + words: &'a [u64], + slot: usize, + n: usize, +} + +impl Iterator for BitSliceIter<'_> { + type Item = bool; + fn next(&mut self) -> Option { + if self.slot >= self.n { return None; } + let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0; + self.slot += 1; + Some(v) + } + fn size_hint(&self) -> (usize, Option) { + let rem = self.n - self.slot; + (rem, Some(rem)) + } +} +impl ExactSizeIterator for BitSliceIter<'_> {} + +// ── IntSliceView ────────────────────────────────────────────────────────────── + +/// Lightweight, copy-able read-only view over a compact-int primary array plus +/// its sorted raw overflow bytes. Zero-copy: all data lives in the caller's mmap. +#[derive(Clone, Copy)] +pub struct IntSliceView<'a> { + pub(crate) primary: &'a [u8], + pub(crate) overflow_raw: &'a [u8], // n_overflow × OVERFLOW_ENTRY_SIZE bytes, sorted by slot + pub(crate) n_overflow: usize, + pub(crate) n: usize, +} + +impl<'a> IntSliceView<'a> { + #[inline] + pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self { + Self { primary, overflow_raw, n_overflow, n } + } + + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + pub fn primary_bytes(&self) -> &'a [u8] { self.primary } + pub fn n_overflow(&self) -> usize { self.n_overflow } + + pub fn overflow_entries(&self) -> impl Iterator + 'a { + let raw = self.overflow_raw; + let n_ov = self.n_overflow; + (0..n_ov).map(move |i| parse_overflow_entry(raw, 0, i)) + } + + /// O(log n_overflow) via binary search (overflow is always sorted by slot). + pub fn get(&self, slot: usize) -> u32 { + let b = self.primary[slot]; + if b < 255 { return b as u32; } + let mut lo = 0usize; + let mut hi = self.n_overflow; + while lo < hi { + let mid = lo + (hi - lo) / 2; + let (s, v) = parse_overflow_entry(self.overflow_raw, 0, mid); + match s.cmp(&slot) { + std::cmp::Ordering::Equal => return v, + std::cmp::Ordering::Less => lo = mid + 1, + std::cmp::Ordering::Greater => hi = mid, + } + } + panic!("slot {slot} marked overflow but not found") + } + + /// Sequential merge scan: yields all n values in slot order. + pub fn iter(&self) -> IntSliceViewIter<'a> { + IntSliceViewIter { + primary: self.primary, + overflow_raw: self.overflow_raw, + slot: 0, + overflow_pos: 0, + n: self.n, + } + } + + pub fn sum(&self) -> u64 { + byte_sum(self.primary, self.overflow_entries().map(|(_, v)| v)) + } + + pub fn count_nonzero(&self) -> u64 { + byte_count_nonzero(self.primary) + } + + // ── Distance methods ────────────────────────────────────────────────────── + + pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum() + } + + pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 { + let sum_min = self.partial_bray_dist(other); + let denom = self.sum() + other.sum(); + if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 } + } + + pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 }; + let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 }; + pa.min(pb) + }) + .sum() + } + + pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 { + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + 1.0 - self.partial_relfreq_bray_dist(other, sa, sb) + } + + pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) + .sum() + } + + pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + self.partial_euclidean_dist(other).sqrt() + } + + pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 }; + let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 }; + let d = pa - pb; + d * d + }) + .sum() + } + + pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt() + } + + pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sa > 0.0 { (a as f64 / sa).sqrt() } else { 0.0 }; + let pb = if sb > 0.0 { (b as f64 / sb).sqrt() } else { 0.0 }; + let d = pa - pb; + d * d + }) + .sum() + } + + pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt() + } + + pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 { + self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2 + } + + pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .fold((0u64, 0u64), |(inter, uni), (a, b)| { + let ap = a >= threshold; + let bp = b >= threshold; + (inter + (ap & bp) as u64, uni + (ap | bp) as u64) + }) + } + + pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 { + let (inter, union) = self.partial_threshold_jaccard_dist(other, threshold); + if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } + } + + pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 { + self.threshold_jaccard_dist(other, 1) + } +} + +// ── IntSliceViewIter ────────────────────────────────────────────────────────── + +pub struct IntSliceViewIter<'a> { + primary: &'a [u8], + overflow_raw: &'a [u8], + slot: usize, + overflow_pos: usize, + n: usize, +} + +impl Iterator for IntSliceViewIter<'_> { + type Item = u32; + fn next(&mut self) -> Option { + if self.slot >= self.n { return None; } + let v = self.primary[self.slot]; + self.slot += 1; + if v < 255 { + Some(v as u32) + } else { + let (_, val) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos); + self.overflow_pos += 1; + Some(val) + } + } + fn size_hint(&self) -> (usize, Option) { + let rem = self.n - self.slot; + (rem, Some(rem)) + } +} +impl ExactSizeIterator for IntSliceViewIter<'_> {} diff --git a/src/obikpartitionner/src/common.rs b/src/obikpartitionner/src/common.rs index 76d3bf3..99e345e 100644 --- a/src/obikpartitionner/src/common.rs +++ b/src/obikpartitionner/src/common.rs @@ -3,7 +3,6 @@ use std::io; use std::path::{Path, PathBuf}; use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder}; -use obicompactvec::traits::BitSliceMut; use obilayeredmap::meta::PartitionMeta; use obilayeredmap::{IndexMode, OLMError}; use obiskio::{SKError, SKResult}; diff --git a/src/obikpartitionner/src/select_layer.rs b/src/obikpartitionner/src/select_layer.rs index 56b2ac7..36286c0 100644 --- a/src/obikpartitionner/src/select_layer.rs +++ b/src/obikpartitionner/src/select_layer.rs @@ -6,7 +6,6 @@ use obicompactvec::{ PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder, }; -use obicompactvec::traits::BitSliceMut; use obilayeredmap::meta::PartitionMeta; use obilayeredmap::OLMError; use obiskio::{SKError, SKResult}; diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index c79e781..72b38ea 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -6,7 +6,6 @@ use obicompactvec::{ PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, }; -use obicompactvec::traits::BitSliceMut; use obikseq::CanonicalKmer; use obiskio::{UnitigFileReader, UnitigFileWriter}; diff --git a/src/obilayeredmap/src/layered_store.rs b/src/obilayeredmap/src/layered_store.rs index 6ebf343..433183e 100644 --- a/src/obilayeredmap/src/layered_store.rs +++ b/src/obilayeredmap/src/layered_store.rs @@ -102,7 +102,6 @@ mod tests { PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, }; - use obicompactvec::traits::BitSliceMut; use tempfile::tempdir; fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {