refactor(obicompactvec): unify bit and int vector slice views
Refactors column and matrix access to use unified `BitSliceView` and `IntSliceView` abstractions, replacing legacy `PackedCol`/`IntColView` types. Introduces `BitSlice`/`IntSlice` traits for zero-copy, trait-based bitwise and arithmetic operations across persistent and temporary vector types. Removes deprecated in-memory `MemoryBitVec` and `MemoryIntVec` implementations and their tests, while updating dependent crates to use the new view-based API and `BitSliceMut` trait.
This commit is contained in:
@@ -7,13 +7,12 @@ use ndarray::{Array1, Array2};
|
|||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
|
||||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||||
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
|
use crate::colgroup::{ColGroup, MatrixGroupOps};
|
||||||
use crate::memoryvec::MemoryBitVec;
|
|
||||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
|
||||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
|
||||||
use crate::traits::{BitSlice, BitSliceMut, IntSliceMut};
|
|
||||||
use crate::layer_meta::LayerMeta;
|
use crate::layer_meta::LayerMeta;
|
||||||
use crate::meta::MatrixMeta;
|
use crate::meta::MatrixMeta;
|
||||||
|
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||||
|
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||||
|
use crate::views::BitSliceView;
|
||||||
|
|
||||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||||
dir.join(format!("col_{col:06}.pbiv"))
|
dir.join(format!("col_{col:06}.pbiv"))
|
||||||
@@ -143,18 +142,14 @@ impl PackedBitMatrix {
|
|||||||
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_slice(&self, c: usize) -> PackedCol<'_> {
|
pub(crate) fn col_slice(&self, c: usize) -> BitSliceView<'_> {
|
||||||
PackedCol { words: self.col_words(c), n: self.n_rows }
|
BitSliceView::new(self.col_words(c), self.n_rows)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
|
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
|
||||||
PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
|
PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryBitVec {
|
|
||||||
MemoryBitVec::from(&self.col_slice(c))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn count_ones(&self) -> Array1<u64> {
|
pub(crate) fn count_ones(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
(0..self.n_cols).into_par_iter()
|
(0..self.n_cols).into_par_iter()
|
||||||
@@ -165,47 +160,17 @@ impl PackedBitMatrix {
|
|||||||
|
|
||||||
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||||
pairwise2_matrix(self.n_cols, |i, j| {
|
pairwise2_matrix(self.n_cols, |i, j| {
|
||||||
self.col_slice(i).partial_jaccard_dist(&self.col_slice(j))
|
self.col_slice(i).partial_jaccard_dist(self.col_slice(j))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| {
|
pairwise_matrix(self.n_cols, |i, j| {
|
||||||
self.col_slice(i).hamming_dist(&self.col_slice(j))
|
self.col_slice(i).hamming_dist(self.col_slice(j))
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) struct PackedCol<'a> {
|
|
||||||
words: &'a [u64],
|
|
||||||
n: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BitSlice for PackedCol<'_> {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
fn words(&self) -> &[u64] { self.words }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── BitColView — uniform column access across Columnar and Packed ─────────────
|
|
||||||
|
|
||||||
enum BitColViewInner<'a> {
|
|
||||||
Columnar(&'a PersistentBitVec),
|
|
||||||
Packed(PackedCol<'a>),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Opaque column view returned by [`PersistentBitMatrix::col_view`].
|
|
||||||
/// Implements [`BitSlice`] uniformly for both Columnar and Packed matrix formats.
|
|
||||||
pub struct BitColView<'a>(BitColViewInner<'a>);
|
|
||||||
|
|
||||||
impl BitSlice for BitColView<'_> {
|
|
||||||
fn len(&self) -> usize {
|
|
||||||
match &self.0 { BitColViewInner::Columnar(c) => c.len(), BitColViewInner::Packed(c) => c.len() }
|
|
||||||
}
|
|
||||||
fn words(&self) -> &[u64] {
|
|
||||||
match &self.0 { BitColViewInner::Columnar(c) => c.words(), BitColViewInner::Packed(c) => c.words() }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
|
||||||
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
|
||||||
let packed_path = dir.join("matrix.pbmx");
|
let packed_path = dir.join("matrix.pbmx");
|
||||||
@@ -321,10 +286,10 @@ impl PersistentBitMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn col_view(&self, c: usize) -> BitColView<'_> {
|
pub fn col_view(&self, c: usize) -> BitSliceView<'_> {
|
||||||
match self {
|
match self {
|
||||||
Self::Columnar(m) => BitColView(BitColViewInner::Columnar(m.col(c))),
|
Self::Columnar(m) => m.col(c).view(),
|
||||||
Self::Packed(m) => BitColView(BitColViewInner::Packed(m.col_slice(c))),
|
Self::Packed(m) => m.col_slice(c),
|
||||||
Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
|
Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -341,14 +306,6 @@ impl PersistentBitMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn col_as_memory(&self, c: usize) -> MemoryBitVec {
|
|
||||||
match self {
|
|
||||||
Self::Columnar(m) => MemoryBitVec::from(m.col(c)),
|
|
||||||
Self::Packed(m) => m.col_as_memory(c),
|
|
||||||
Self::Implicit { n_rows, .. } => MemoryBitVec::ones(*n_rows),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn row(&self, slot: usize) -> Box<[bool]> {
|
pub fn row(&self, slot: usize) -> Box<[bool]> {
|
||||||
match self {
|
match self {
|
||||||
Self::Columnar(m) => m.row(slot),
|
Self::Columnar(m) => m.row(slot),
|
||||||
@@ -458,27 +415,19 @@ impl MatrixGroupOps for PersistentBitMatrix {
|
|||||||
let n = self.n();
|
let n = self.n();
|
||||||
if g.indices.len() < 255 {
|
if g.indices.len() < 255 {
|
||||||
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||||
{
|
|
||||||
let primary = builder.primary_bytes_mut();
|
|
||||||
for &c in &g.indices {
|
for &c in &g.indices {
|
||||||
let mbv = MemoryBitVec::from(&self.col_view(c));
|
builder.inc_present_fast(self.col_view(c));
|
||||||
inc_primary_bits(primary, &mbv);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
builder.freeze()
|
builder.freeze()
|
||||||
} else {
|
} else {
|
||||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
for chunk in g.indices.chunks(254) {
|
for chunk in g.indices.chunks(254) {
|
||||||
let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
|
let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
|
||||||
{
|
|
||||||
let primary = chunk_builder.primary_bytes_mut();
|
|
||||||
for &c in chunk {
|
for &c in chunk {
|
||||||
let mbv = MemoryBitVec::from(&self.col_view(c));
|
chunk_b.inc_present_fast(self.col_view(c));
|
||||||
inc_primary_bits(primary, &mbv);
|
|
||||||
}
|
}
|
||||||
}
|
let frozen = chunk_b.freeze()?;
|
||||||
let chunk_frozen = chunk_builder.freeze()?;
|
result.add(frozen.view());
|
||||||
IntSliceMut::add(&mut result, &chunk_frozen);
|
|
||||||
}
|
}
|
||||||
result.freeze()
|
result.freeze()
|
||||||
}
|
}
|
||||||
@@ -493,7 +442,7 @@ impl MatrixGroupOps for PersistentBitMatrix {
|
|||||||
let n = self.n();
|
let n = self.n();
|
||||||
let mut result = TempBitVecBuilder::new(n)?;
|
let mut result = TempBitVecBuilder::new(n)?;
|
||||||
for &c in &g.indices {
|
for &c in &g.indices {
|
||||||
result.or(&self.col_view(c));
|
result.or(self.col_view(c));
|
||||||
}
|
}
|
||||||
result.freeze()
|
result.freeze()
|
||||||
}
|
}
|
||||||
|
|||||||
+103
-103
@@ -5,25 +5,21 @@ use std::path::{Path, PathBuf};
|
|||||||
use memmap2::{Mmap, MmapMut};
|
use memmap2::{Mmap, MmapMut};
|
||||||
|
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::views::{BitSliceView, BitSliceIter};
|
||||||
|
|
||||||
const MAGIC: [u8; 4] = *b"PBIV";
|
const MAGIC: [u8; 4] = *b"PBIV";
|
||||||
|
|
||||||
// Header: magic(4) + _pad(4) + n(8) = 16 bytes.
|
// Header: magic(4) + _pad(4) + n(8) = 16 bytes.
|
||||||
// Data starts at offset 16, which is divisible by 8 → u64-aligned
|
// Data starts at offset 16, u64-aligned (mmap base is page-aligned, 16 % 8 == 0).
|
||||||
// (mmap base is page-aligned, 16 % 8 == 0).
|
|
||||||
const HEADER_SIZE: usize = 16;
|
const HEADER_SIZE: usize = 16;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn n_words(n: usize) -> usize {
|
pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) }
|
||||||
n.div_ceil(64)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn n_bytes_for_words(n: usize) -> usize {
|
fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 }
|
||||||
n_words(n) * 8
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Reader ────────────────────────────────────────────────────────────────────
|
// ── PersistentBitVec ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct PersistentBitVec {
|
pub struct PersistentBitVec {
|
||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
@@ -35,44 +31,49 @@ impl PersistentBitVec {
|
|||||||
pub fn open(path: &Path) -> io::Result<Self> {
|
pub fn open(path: &Path) -> io::Result<Self> {
|
||||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||||
if mmap.len() < HEADER_SIZE {
|
if mmap.len() < HEADER_SIZE {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short"));
|
||||||
io::ErrorKind::InvalidData,
|
|
||||||
"PBIV file too short",
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
if &mmap[0..4] != &MAGIC {
|
if &mmap[0..4] != &MAGIC {
|
||||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
|
||||||
}
|
}
|
||||||
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
|
||||||
Ok(Self {
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn path(&self) -> &Path {
|
pub fn path(&self) -> &Path { &self.path }
|
||||||
&self.path
|
pub fn len(&self) -> usize { self.n }
|
||||||
}
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
self.n
|
|
||||||
}
|
|
||||||
pub fn is_empty(&self) -> bool {
|
|
||||||
self.n == 0
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get(&self, slot: usize) -> bool {
|
pub fn get(&self, slot: usize) -> bool {
|
||||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
|
// SAFETY: mmap is page-aligned, HEADER_SIZE=16 divisible by 8 → u64-aligned.
|
||||||
// so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes.
|
|
||||||
fn data_words(&self) -> &[u64] {
|
fn data_words(&self) -> &[u64] {
|
||||||
let nw = n_words(self.n);
|
let nw = n_words(self.n);
|
||||||
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
||||||
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
unsafe { std::slice::from_raw_parts(ptr, nw) }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
|
BitSliceView::new(self.data_words(), self.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn words(&self) -> &[u64] { self.data_words() }
|
||||||
|
|
||||||
|
pub fn count_ones(&self) -> u64 { self.view().count_ones() }
|
||||||
|
pub fn count_zeros(&self) -> u64 { self.view().count_zeros() }
|
||||||
|
|
||||||
|
pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) {
|
||||||
|
self.view().partial_jaccard_dist(other.view())
|
||||||
|
}
|
||||||
|
pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
|
||||||
|
self.view().jaccard_dist(other.view())
|
||||||
|
}
|
||||||
|
pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
|
||||||
|
self.view().hamming_dist(other.view())
|
||||||
|
}
|
||||||
|
|
||||||
pub fn iter(&self) -> BitIter<'_> {
|
pub fn iter(&self) -> BitIter<'_> {
|
||||||
BitIter { words: self.data_words(), slot: 0, n: self.n }
|
BitIter { words: self.data_words(), slot: 0, n: self.n }
|
||||||
}
|
}
|
||||||
@@ -81,11 +82,11 @@ impl PersistentBitVec {
|
|||||||
impl<'a> IntoIterator for &'a PersistentBitVec {
|
impl<'a> IntoIterator for &'a PersistentBitVec {
|
||||||
type Item = bool;
|
type Item = bool;
|
||||||
type IntoIter = BitIter<'a>;
|
type IntoIter = BitIter<'a>;
|
||||||
fn into_iter(self) -> BitIter<'a> {
|
fn into_iter(self) -> BitIter<'a> { self.iter() }
|
||||||
self.iter()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── BitIter ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct BitIter<'a> {
|
pub struct BitIter<'a> {
|
||||||
pub(crate) words: &'a [u64],
|
pub(crate) words: &'a [u64],
|
||||||
pub(crate) slot: usize,
|
pub(crate) slot: usize,
|
||||||
@@ -96,21 +97,19 @@ impl ExactSizeIterator for BitIter<'_> {}
|
|||||||
|
|
||||||
impl Iterator for BitIter<'_> {
|
impl Iterator for BitIter<'_> {
|
||||||
type Item = bool;
|
type Item = bool;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<bool> {
|
fn next(&mut self) -> Option<bool> {
|
||||||
if self.slot >= self.n { return None; }
|
if self.slot >= self.n { return None; }
|
||||||
let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
|
let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
|
||||||
self.slot += 1;
|
self.slot += 1;
|
||||||
Some(v)
|
Some(v)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
let rem = self.n - self.slot;
|
let rem = self.n - self.slot;
|
||||||
(rem, Some(rem))
|
(rem, Some(rem))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
// ── PersistentBitVecBuilder ───────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct PersistentBitVecBuilder {
|
pub struct PersistentBitVecBuilder {
|
||||||
mmap: MmapMut,
|
mmap: MmapMut,
|
||||||
@@ -122,13 +121,10 @@ impl PersistentBitVecBuilder {
|
|||||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||||
let mut file = OpenOptions::new()
|
let mut file = OpenOptions::new()
|
||||||
.read(true)
|
.read(true).write(true).create(true).truncate(true)
|
||||||
.write(true)
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
file.write_all(&MAGIC)?;
|
file.write_all(&MAGIC)?;
|
||||||
file.write_all(&[0u8; 4])?; // padding
|
file.write_all(&[0u8; 4])?;
|
||||||
file.write_all(&(n as u64).to_le_bytes())?;
|
file.write_all(&(n as u64).to_le_bytes())?;
|
||||||
file.seek(SeekFrom::Start(0))?;
|
file.seek(SeekFrom::Start(0))?;
|
||||||
file.set_len(file_size as u64)?;
|
file.set_len(file_size as u64)?;
|
||||||
@@ -136,8 +132,6 @@ impl PersistentBitVecBuilder {
|
|||||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create a PBIV file from raw packed bit-bytes, zero-padding to the next word boundary.
|
|
||||||
/// `bytes` is `n.div_ceil(8)` bytes; `n` is the number of bits.
|
|
||||||
pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
|
pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
|
||||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||||
let file = OpenOptions::new()
|
let file = OpenOptions::new()
|
||||||
@@ -159,17 +153,46 @@ impl PersistentBitVecBuilder {
|
|||||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result<Self> {
|
||||||
self.n
|
let n = source.len();
|
||||||
|
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||||
|
let mut file = OpenOptions::new()
|
||||||
|
.read(true).write(true).create(true).truncate(true)
|
||||||
|
.open(path)?;
|
||||||
|
file.write_all(&MAGIC)?;
|
||||||
|
file.write_all(&[0u8; 4])?;
|
||||||
|
file.write_all(&(n as u64).to_le_bytes())?;
|
||||||
|
file.seek(SeekFrom::Start(0))?;
|
||||||
|
file.set_len(file_size as u64)?;
|
||||||
|
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
|
{
|
||||||
|
let nw = n_words(n);
|
||||||
|
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
||||||
|
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
||||||
|
for (slot, count) in source.iter().enumerate() {
|
||||||
|
if count >= threshold { words[slot >> 6] |= 1u64 << (slot & 63); }
|
||||||
}
|
}
|
||||||
pub fn is_empty(&self) -> bool {
|
|
||||||
self.n == 0
|
|
||||||
}
|
}
|
||||||
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||||
|
Self::build_from_counts(source, 1, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize { self.n }
|
||||||
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
|
|
||||||
pub fn get(&self, slot: usize) -> bool {
|
pub fn get(&self, slot: usize) -> bool {
|
||||||
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
(self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn set(&mut self, slot: usize, value: bool) {
|
||||||
|
let bit = 1u64 << (slot & 63);
|
||||||
|
if value { self.data_words_mut()[slot >> 6] |= bit; }
|
||||||
|
else { self.data_words_mut()[slot >> 6] &= !bit; }
|
||||||
|
}
|
||||||
|
|
||||||
fn data_words(&self) -> &[u64] {
|
fn data_words(&self) -> &[u64] {
|
||||||
let nw = n_words(self.n);
|
let nw = n_words(self.n);
|
||||||
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
|
||||||
@@ -183,73 +206,50 @@ impl PersistentBitVecBuilder {
|
|||||||
unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
|
unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert a count vector to a bit vector: bit set iff count >= threshold.
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
/// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
|
BitSliceView::new(self.data_words(), self.n)
|
||||||
pub fn build_from_counts(
|
|
||||||
source: &PersistentCompactIntVec,
|
|
||||||
threshold: u32,
|
|
||||||
path: &Path,
|
|
||||||
) -> io::Result<Self> {
|
|
||||||
let n = source.len();
|
|
||||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
|
||||||
let mut file = OpenOptions::new()
|
|
||||||
.read(true)
|
|
||||||
.write(true)
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.open(path)?;
|
|
||||||
file.write_all(&MAGIC)?;
|
|
||||||
file.write_all(&[0u8; 4])?;
|
|
||||||
file.write_all(&(n as u64).to_le_bytes())?;
|
|
||||||
file.seek(SeekFrom::Start(0))?;
|
|
||||||
file.set_len(file_size as u64)?;
|
|
||||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
|
||||||
|
|
||||||
{
|
|
||||||
let nw = n_words(n);
|
|
||||||
let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
|
|
||||||
let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
|
|
||||||
for (slot, count) in source.iter().enumerate() {
|
|
||||||
if count >= threshold {
|
|
||||||
words[slot >> 6] |= 1u64 << (slot & 63);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn words(&self) -> &[u64] { self.data_words() }
|
||||||
|
|
||||||
|
pub fn copy_from(&mut self, src: BitSliceView<'_>) {
|
||||||
|
assert_eq!(self.n, src.len(), "BitSliceView length mismatch");
|
||||||
|
self.data_words_mut().copy_from_slice(src.words());
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn and(&mut self, other: BitSliceView<'_>) {
|
||||||
|
assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
|
||||||
|
for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w &= o; }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn or(&mut self, other: BitSliceView<'_>) {
|
||||||
|
assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
|
||||||
|
for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w |= o; }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn xor(&mut self, other: BitSliceView<'_>) {
|
||||||
|
assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
|
||||||
|
for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w ^= o; }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn not(&mut self) {
|
||||||
|
let rem = self.n % 64;
|
||||||
|
let words = self.data_words_mut();
|
||||||
|
for w in words.iter_mut() { *w ^= u64::MAX; }
|
||||||
|
if rem != 0 {
|
||||||
|
if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
pub fn iter(&self) -> BitSliceIter<'_> {
|
||||||
|
self.view().iter()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Convert a count vector to a presence/absence bit vector (threshold = 1).
|
pub fn close(self) -> io::Result<()> { self.mmap.flush() }
|
||||||
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
|
||||||
Self::build_from_counts(source, 1, path)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn close(self) -> io::Result<()> {
|
|
||||||
self.mmap.flush()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Flush, close, and reopen as a read-only `PersistentBitVec`.
|
|
||||||
pub fn finish(self) -> io::Result<PersistentBitVec> {
|
pub fn finish(self) -> io::Result<PersistentBitVec> {
|
||||||
let path = self.path.clone();
|
let path = self.path.clone();
|
||||||
self.close()?;
|
self.close()?;
|
||||||
PersistentBitVec::open(&path)
|
PersistentBitVec::open(&path)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── BitSlice / BitSliceMut impls ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
use crate::traits::{BitSlice, BitSliceMut};
|
|
||||||
|
|
||||||
impl BitSlice for PersistentBitVec {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
fn words(&self) -> &[u64] { self.data_words() }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BitSlice for PersistentBitVecBuilder {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
fn words(&self) -> &[u64] { self.data_words() }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BitSliceMut for PersistentBitVecBuilder {
|
|
||||||
fn words_mut(&mut self) -> &mut [u64] { self.data_words_mut() }
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ use memmap2::MmapMut;
|
|||||||
|
|
||||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
|
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
|
use crate::views::{BitSliceView, IntSliceView};
|
||||||
|
|
||||||
pub struct PersistentCompactIntVecBuilder {
|
pub struct PersistentCompactIntVecBuilder {
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
@@ -16,44 +17,16 @@ pub struct PersistentCompactIntVecBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentCompactIntVecBuilder {
|
impl PersistentCompactIntVecBuilder {
|
||||||
/// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately.
|
|
||||||
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
pub fn new(n: usize, path: &Path) -> io::Result<Self> {
|
||||||
let file = OpenOptions::new()
|
|
||||||
.read(true)
|
|
||||||
.write(true)
|
|
||||||
.create(true)
|
|
||||||
.truncate(true)
|
|
||||||
.open(path)?;
|
|
||||||
file.set_len((HEADER_SIZE + n) as u64)?;
|
|
||||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
|
||||||
Ok(Self {
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
overflow: HashMap::new(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create from a [`MemoryIntVec`], copying primary bytes directly into the mmap.
|
|
||||||
/// O(n) memcpy + O(n_overflow) HashMap clone — no per-slot `set` overhead.
|
|
||||||
pub fn from_memory(src: &crate::memoryintvec::MemoryIntVec, path: &Path) -> io::Result<Self> {
|
|
||||||
let n = src.len();
|
|
||||||
let file = OpenOptions::new()
|
let file = OpenOptions::new()
|
||||||
.read(true).write(true).create(true).truncate(true)
|
.read(true).write(true).create(true).truncate(true)
|
||||||
.open(path)?;
|
.open(path)?;
|
||||||
file.set_len((HEADER_SIZE + n) as u64)?;
|
file.set_len((HEADER_SIZE + n) as u64)?;
|
||||||
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(src.primary_bytes());
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
|
||||||
Ok(Self {
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
overflow: src.overflow_map().clone(),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Create from raw primary bytes + an already-built overflow map (no per-slot overhead).
|
pub fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
|
||||||
pub(crate) fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
|
|
||||||
let n = primary.len();
|
let n = primary.len();
|
||||||
let file = OpenOptions::new()
|
let file = OpenOptions::new()
|
||||||
.read(true).write(true).create(true).truncate(true)
|
.read(true).write(true).create(true).truncate(true)
|
||||||
@@ -64,39 +37,24 @@ impl PersistentCompactIntVecBuilder {
|
|||||||
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
|
|
||||||
/// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow).
|
|
||||||
pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
|
||||||
fs::copy(source.path(), path)?;
|
fs::copy(source.path(), path)?;
|
||||||
|
|
||||||
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
let file = OpenOptions::new().read(true).write(true).open(path)?;
|
||||||
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
let mmap = unsafe { MmapMut::map_mut(&file)? };
|
||||||
|
|
||||||
let n = source.len();
|
let n = source.len();
|
||||||
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
|
||||||
let data_offset = HEADER_SIZE + n;
|
let data_offset = HEADER_SIZE + n;
|
||||||
|
|
||||||
let mut overflow = HashMap::with_capacity(n_overflow);
|
let mut overflow = HashMap::with_capacity(n_overflow);
|
||||||
for i in 0..n_overflow {
|
for i in 0..n_overflow {
|
||||||
let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
|
let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
|
||||||
overflow.insert(slot, value);
|
overflow.insert(slot, value);
|
||||||
}
|
}
|
||||||
|
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
|
||||||
Ok(Self {
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
overflow,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the value at the given slot, handling overflow if necessary.
|
|
||||||
pub fn get(&self, slot: usize) -> u32 {
|
pub fn get(&self, slot: usize) -> u32 {
|
||||||
match self.mmap[HEADER_SIZE + slot] {
|
match self.mmap[HEADER_SIZE + slot] {
|
||||||
255 => *self
|
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
||||||
.overflow
|
|
||||||
.get(&slot)
|
|
||||||
.expect("sentinel without overflow entry"),
|
|
||||||
v => v as u32,
|
v => v as u32,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -111,15 +69,189 @@ impl PersistentCompactIntVecBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize {
|
pub fn len(&self) -> usize { self.n }
|
||||||
self.n
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
|
|
||||||
|
pub fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
||||||
|
pub fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
||||||
|
pub fn clear_overflow(&mut self) { self.overflow.clear(); }
|
||||||
|
|
||||||
|
pub fn sum(&self) -> u64 {
|
||||||
|
byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
|
||||||
|
}
|
||||||
|
pub fn count_nonzero(&self) -> u64 {
|
||||||
|
byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_empty(&self) -> bool {
|
pub fn view(&self) -> IntSliceView<'_> {
|
||||||
self.n == 0
|
// Builder overflow is a HashMap, not sorted raw bytes — convert on the fly
|
||||||
|
// by collecting into a sorted vec and storing in a thread-local buffer.
|
||||||
|
// For read-back during building, just call get(slot) directly.
|
||||||
|
// view() is primarily useful AFTER freeze (on PersistentCompactIntVec).
|
||||||
|
// Here we expose it via a zero-alloc path: primary only, no overflow raw.
|
||||||
|
// Callers that need overflow_entries during building use overflow_entries().
|
||||||
|
let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n];
|
||||||
|
IntSliceView::new(primary, &[], 0, self.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
||||||
|
self.overflow.iter().map(|(&k, &v)| (k, v))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn inc(&mut self, slot: usize) {
|
||||||
|
let v = self.get(slot);
|
||||||
|
self.set(slot, v.saturating_add(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Computation methods ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Increment one counter per 1-bit of `col`. Safe for any group size.
|
||||||
|
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
|
||||||
|
let n = self.n;
|
||||||
|
for (wi, &word) in col.words().iter().enumerate() {
|
||||||
|
if word == 0 { continue; }
|
||||||
|
let mut w = word;
|
||||||
|
while w != 0 {
|
||||||
|
let bit = w.trailing_zeros() as usize;
|
||||||
|
let slot = wi * 64 + bit;
|
||||||
|
if slot < n { self.inc(slot); }
|
||||||
|
w &= w - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Increment one counter per 1-bit of `col`, using raw u8 arithmetic.
|
||||||
|
/// Caller guarantees no counter will reach 255 (group size < 255).
|
||||||
|
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
||||||
|
{
|
||||||
|
let primary = self.primary_bytes_mut();
|
||||||
|
let n = primary.len();
|
||||||
|
for (wi, &word) in col.words().iter().enumerate() {
|
||||||
|
if word == 0 { continue; }
|
||||||
|
let mut w = word;
|
||||||
|
while w != 0 {
|
||||||
|
let bit = w.trailing_zeros() as usize;
|
||||||
|
let s = wi * 64 + bit;
|
||||||
|
if s < n { primary[s] += 1; }
|
||||||
|
w &= w - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug_assert!(
|
||||||
|
!self.primary_bytes().contains(&255),
|
||||||
|
"sentinel 255 reached in inc_present_fast — group size must be < 255"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Two-pass: primary bytes then overflow. Increments `self[slot]` for each
|
||||||
|
/// slot where `pred(col[slot])` is true. Safe for any group size.
|
||||||
|
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
let n = col.len();
|
||||||
|
for slot in 0..n {
|
||||||
|
let b = col.primary_bytes()[slot];
|
||||||
|
if b < 255 && pred(b as u32) {
|
||||||
|
self.inc(slot);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if pred(val) { self.inc(slot); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fast two-pass: raw u8 arithmetic. Caller guarantees no counter reaches 255.
|
||||||
|
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
let n = col.len();
|
||||||
|
{
|
||||||
|
let primary = self.primary_bytes_mut();
|
||||||
|
for slot in 0..n {
|
||||||
|
let b = col.primary_bytes()[slot];
|
||||||
|
if b < 255 && pred(b as u32) {
|
||||||
|
primary[slot] += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if pred(val) { self.primary_bytes_mut()[slot] += 1; }
|
||||||
|
}
|
||||||
|
debug_assert!(
|
||||||
|
!self.primary_bytes().contains(&255),
|
||||||
|
"sentinel 255 reached in inc_predicate_fast — group size must be < 255"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Element-wise addition: `self[s] += other[s]` for every slot of `self`.
///
/// Fast path: when both primary bytes are below the 255 sentinel the sum is
/// computed from the bytes alone and written in place if it stays below 255,
/// otherwise it is handed to `set`. Slow path: at least one side holds the
/// sentinel, so both values are read through `get`.
///
/// Sums saturate at `u32::MAX` (as `inc` does) instead of overflowing.
///
/// # Panics
/// Panics if `other.primary_bytes()` is shorter than `self.n`.
pub fn add(&mut self, other: IntSliceView<'_>) {
    let theirs = other.primary_bytes();
    for s in 0..self.n {
        let sb = self.primary_bytes()[s];
        let ob = theirs[s];
        if sb < 255 && ob < 255 {
            // Two bytes below 255 sum to at most 508, so this cannot overflow.
            let sum = u32::from(sb) + u32::from(ob);
            if sum < 255 {
                self.primary_bytes_mut()[s] = sum as u8;
            } else {
                self.set(s, sum);
            }
        } else {
            let total = self.get(s).saturating_add(other.get(s));
            self.set(s, total);
        }
    }
}
|
||||||
|
|
||||||
|
pub fn min(&mut self, other: IntSliceView<'_>) {
|
||||||
|
let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
|
||||||
|
let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
|
||||||
|
self.clear_overflow();
|
||||||
|
for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
|
||||||
|
if b < *a { *a = b; }
|
||||||
|
}
|
||||||
|
for (slot, self_val) in self_ov {
|
||||||
|
if let Some(&other_val) = other_ov.get(&slot) {
|
||||||
|
self.set(slot, self_val.min(other_val));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Element-wise maximum: `self[s] = max(self[s], other[s])`.
///
/// Slots that overflow in `other` are merged first through `get`/`set`; the
/// remaining slots are then resolved by a byte-wise maximum over the primary
/// arrays.
pub fn max(&mut self, other: IntSliceView<'_>) {
    for (slot, theirs) in other.overflow_entries() {
        let merged = self.get(slot).max(theirs);
        self.set(slot, merged);
    }

    let rhs = other.primary_bytes();
    for (dst, &src) in self.primary_bytes_mut().iter_mut().zip(rhs) {
        if *dst < src {
            *dst = src;
        }
    }
}
|
||||||
|
|
||||||
|
/// Element-wise saturating subtraction: `self[s] = self[s] - other[s]`,
/// clamped at zero, for every slot of `self`.
///
/// A primary byte of 255 marks a slot whose value is read through `get`.
pub fn diff(&mut self, other: IntSliceView<'_>) {
    for slot in 0..self.n {
        let mine = self.primary_bytes()[slot];
        let theirs = other.primary_bytes()[slot];

        if mine == 255 {
            // `self` overflows here: work on full u32 values.
            let lhs = self.get(slot);
            let rhs = if theirs == 255 { other.get(slot) } else { theirs as u32 };
            self.set(slot, lhs.saturating_sub(rhs));
            continue;
        }

        // `self` is small; an overflowing `other` (>= 255) always clamps to 0.
        let reduced = if theirs == 255 { 0 } else { mine.saturating_sub(theirs) };
        self.primary_bytes_mut()[slot] = reduced;
    }
}
|
||||||
|
|
||||||
|
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
|
||||||
|
let n = self.n;
|
||||||
|
for (wi, &word) in mask.words().iter().enumerate() {
|
||||||
|
if word == u64::MAX { continue; }
|
||||||
|
let mut zeros = !word;
|
||||||
|
while zeros != 0 {
|
||||||
|
let bit = zeros.trailing_zeros() as usize;
|
||||||
|
let s = wi * 64 + bit;
|
||||||
|
if s < n {
|
||||||
|
let b = self.primary_bytes()[s];
|
||||||
|
if b != 0 { self.set(s, 0); }
|
||||||
|
}
|
||||||
|
zeros &= zeros - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Flush the primary mmap, then write sorted overflow data + index and fix the header.
|
|
||||||
pub fn close(self) -> io::Result<()> {
|
pub fn close(self) -> io::Result<()> {
|
||||||
self.mmap.flush()?;
|
self.mmap.flush()?;
|
||||||
let Self { path, mmap, n, overflow } = self;
|
let Self { path, mmap, n, overflow } = self;
|
||||||
@@ -129,35 +261,9 @@ impl PersistentCompactIntVecBuilder {
|
|||||||
finalize_pciv(&path, n, &entries)
|
finalize_pciv(&path, n, &entries)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Close and reopen as a read-only [`PersistentCompactIntVec`].
|
|
||||||
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
|
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
|
||||||
let path = self.path.clone();
|
let path = self.path.clone();
|
||||||
self.close()?;
|
self.close()?;
|
||||||
PersistentCompactIntVec::open(&path)
|
PersistentCompactIntVec::open(&path)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── IntSlice / IntSliceMut impls ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
use crate::traits::{IntSlice, IntSliceMut};
|
|
||||||
|
|
||||||
impl IntSlice for PersistentCompactIntVecBuilder {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
fn get(&self, slot: usize) -> u32 { self.get(slot) }
|
|
||||||
fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
|
||||||
self.overflow.iter().map(|(&k, &v)| (k, v))
|
|
||||||
}
|
|
||||||
fn sum(&self) -> u64 {
|
|
||||||
byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
|
|
||||||
}
|
|
||||||
fn count_nonzero(&self) -> u64 {
|
|
||||||
byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IntSliceMut for PersistentCompactIntVecBuilder {
|
|
||||||
fn set(&mut self, slot: usize, value: u32) { self.set(slot, value); }
|
|
||||||
fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
|
|
||||||
fn clear_overflow(&mut self) { self.overflow.clear(); }
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
use crate::memoryvec::MemoryBitVec;
|
|
||||||
use crate::tempbitvec::TempBitVec;
|
use crate::tempbitvec::TempBitVec;
|
||||||
use crate::tempintvec::TempCompactIntVec;
|
use crate::tempintvec::TempCompactIntVec;
|
||||||
use crate::traits::BitSlice;
|
|
||||||
|
|
||||||
// ── ColGroup ──────────────────────────────────────────────────────────────────
|
// ── ColGroup ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -41,22 +39,3 @@ pub trait MatrixGroupOps {
|
|||||||
/// Per-slot OR: true if any group column has value ≥ `threshold`.
|
/// Per-slot OR: true if any group column has value ≥ `threshold`.
|
||||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
|
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Internal helper ───────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/// Iterate 1-bits of a `MemoryBitVec` and increment the corresponding raw
|
|
||||||
/// byte. Caller must guarantee that no counter will reach 255 (group size
|
|
||||||
/// < 255 columns), so that incrementing `u8` is safe and no sentinel is
|
|
||||||
/// accidentally written.
|
|
||||||
pub(crate) fn inc_primary_bits(primary: &mut [u8], mask: &MemoryBitVec) {
|
|
||||||
let n = primary.len();
|
|
||||||
for (wi, &word) in mask.words().iter().enumerate() {
|
|
||||||
let mut w = word;
|
|
||||||
while w != 0 {
|
|
||||||
let bit = w.trailing_zeros() as usize;
|
|
||||||
let s = wi * 64 + bit;
|
|
||||||
if s < n { primary[s] += 1; }
|
|
||||||
w &= w - 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
use std::cmp::Ordering;
|
|
||||||
use std::collections::HashMap;
|
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, BufWriter, Write as _};
|
use std::io::{self, BufWriter, Write as _};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
@@ -10,14 +8,13 @@ use rayon::prelude::*;
|
|||||||
|
|
||||||
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
||||||
use crate::builder::PersistentCompactIntVecBuilder;
|
use crate::builder::PersistentCompactIntVecBuilder;
|
||||||
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
|
use crate::colgroup::{ColGroup, MatrixGroupOps};
|
||||||
use crate::memoryintvec::MemoryIntVec;
|
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE};
|
||||||
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
|
||||||
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
|
||||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
|
||||||
use crate::meta::MatrixMeta;
|
use crate::meta::MatrixMeta;
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
|
use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||||
|
use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||||
|
use crate::views::IntSliceView;
|
||||||
|
|
||||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||||
dir.join(format!("col_{col:06}.pciv"))
|
dir.join(format!("col_{col:06}.pciv"))
|
||||||
@@ -48,9 +45,7 @@ impl ColumnarCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
for (c, col) in self.cols.iter().enumerate() {
|
for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); }
|
||||||
buf[c] = col.get(slot);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||||
@@ -72,31 +67,22 @@ impl ColumnarCompactIntMatrix {
|
|||||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
|
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
||||||
}
|
}
|
||||||
|
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(
|
pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold))
|
||||||
&self, threshold: u32,
|
|
||||||
) -> (Array2<u64>, Array2<u64>) {
|
|
||||||
pairwise2_matrix(self.n_cols(), |i, j| {
|
|
||||||
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols(), |i, j| {
|
pairwise_matrix(self.n_cols(), |i, j| {
|
||||||
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols(), |i, j| {
|
pairwise_matrix(self.n_cols(), |i, j| {
|
||||||
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols(), |i, j| {
|
pairwise_matrix(self.n_cols(), |i, j| {
|
||||||
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
|
||||||
@@ -111,7 +97,6 @@ impl ColumnarCompactIntMatrix {
|
|||||||
meta.n_cols += 1;
|
meta.n_cols += 1;
|
||||||
meta.save(dir)
|
meta.save(dir)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
|
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
|
||||||
@@ -119,153 +104,12 @@ impl ColumnarCompactIntMatrix {
|
|||||||
const PCMX_MAGIC: [u8; 4] = *b"PCMX";
|
const PCMX_MAGIC: [u8; 4] = *b"PCMX";
|
||||||
const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)
|
const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)
|
||||||
|
|
||||||
/// Per-column metadata pre-parsed from the embedded PCIV header.
|
|
||||||
struct ColInfo {
|
struct ColInfo {
|
||||||
primary_start: usize, // absolute mmap offset to primary array
|
primary_start: usize,
|
||||||
data_offset: usize, // absolute mmap offset to overflow array
|
data_offset: usize,
|
||||||
n_overflow: usize,
|
n_overflow: usize,
|
||||||
step: usize,
|
|
||||||
index: Vec<(usize, usize)>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── PackedIntCol — lightweight column view backed by the shared mmap ──────────
|
|
||||||
|
|
||||||
/// Borrowed view of one column: a primary byte per slot plus the raw
/// overflow entries and the sparse index used to search them.
pub(crate) struct PackedIntCol<'a> {
    /// One byte per slot; 255 means the value lives in `overflow`.
    primary: &'a [u8],
    /// Raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE.
    overflow: &'a [u8],
    /// Number of entries encoded in `overflow`.
    n_overflow: usize,
    /// 0 means the index is not used and the whole overflow array is searched.
    step: usize,
    /// `(slot, overflow position)` pairs that bound a lookup's search range.
    index: &'a [(usize, usize)],
    /// Number of slots in the column.
    n: usize,
}
|
|
||||||
|
|
||||||
impl PackedIntCol<'_> {
|
|
||||||
fn overflow_get(&self, slot: usize) -> u32 {
|
|
||||||
let (pos_start, pos_end) = if self.step == 0 {
|
|
||||||
(0, self.n_overflow)
|
|
||||||
} else {
|
|
||||||
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
|
||||||
let start = self.index[i].1;
|
|
||||||
let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
|
|
||||||
(start, end)
|
|
||||||
};
|
|
||||||
let mut lo = pos_start;
|
|
||||||
let mut hi = pos_end;
|
|
||||||
while lo < hi {
|
|
||||||
let mid = lo + (hi - lo) / 2;
|
|
||||||
let (stored, val) = parse_overflow_entry(self.overflow, 0, mid);
|
|
||||||
match stored.cmp(&slot) {
|
|
||||||
Ordering::Equal => return val,
|
|
||||||
Ordering::Less => lo = mid + 1,
|
|
||||||
Ordering::Greater => hi = mid,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
panic!("slot {slot} marked overflow but not found")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IntSlice for PackedIntCol<'_> {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
|
|
||||||
fn get(&self, slot: usize) -> u32 {
|
|
||||||
let v = self.primary[slot];
|
|
||||||
if v < 255 { v as u32 } else { self.overflow_get(slot) }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn primary_bytes(&self) -> &[u8] { self.primary }
|
|
||||||
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
|
||||||
(0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
|
||||||
PackedIntColIter {
|
|
||||||
primary: self.primary,
|
|
||||||
overflow: self.overflow,
|
|
||||||
slot: 0,
|
|
||||||
overflow_pos: 0,
|
|
||||||
n: self.n,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn sum(&self) -> u64 {
|
|
||||||
byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Cursor state for a front-to-back scan of a packed column.
struct PackedIntColIter<'a> {
    /// One byte per slot; 255 redirects to the next overflow entry.
    primary: &'a [u8],
    /// Raw overflow entries, decoded with `parse_overflow_entry`.
    overflow: &'a [u8],
    /// Next slot to yield.
    slot: usize,
    /// Next overflow entry to consume.
    overflow_pos: usize,
    /// Total number of slots.
    n: usize,
}
|
|
||||||
|
|
||||||
impl Iterator for PackedIntColIter<'_> {
|
|
||||||
type Item = u32;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<u32> {
|
|
||||||
if self.slot >= self.n { return None; }
|
|
||||||
let v = self.primary[self.slot];
|
|
||||||
self.slot += 1;
|
|
||||||
if v < 255 {
|
|
||||||
Some(v as u32)
|
|
||||||
} else {
|
|
||||||
let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos);
|
|
||||||
self.overflow_pos += 1;
|
|
||||||
Some(val)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
|
||||||
let rem = self.n - self.slot;
|
|
||||||
(rem, Some(rem))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// `size_hint` returns identical lower and upper bounds, so the exact length is known.
impl<'a> ExactSizeIterator for PackedIntColIter<'a> {}
|
|
||||||
|
|
||||||
// ── IntColView — uniform column access across Columnar and Packed ─────────────
|
|
||||||
|
|
||||||
enum IntColViewInner<'a> {
|
|
||||||
Columnar(&'a PersistentCompactIntVec),
|
|
||||||
Packed(PackedIntCol<'a>),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Opaque column view returned by [`PersistentCompactIntMatrix::col_view`].
|
|
||||||
/// Implements [`IntSlice`] uniformly for both Columnar and Packed matrix formats.
|
|
||||||
pub struct IntColView<'a>(IntColViewInner<'a>);
|
|
||||||
|
|
||||||
impl IntSlice for IntColView<'_> {
|
|
||||||
fn len(&self) -> usize {
|
|
||||||
match &self.0 { IntColViewInner::Columnar(c) => c.len(), IntColViewInner::Packed(c) => c.len() }
|
|
||||||
}
|
|
||||||
fn get(&self, slot: usize) -> u32 {
|
|
||||||
match &self.0 { IntColViewInner::Columnar(c) => c.get(slot), IntColViewInner::Packed(c) => c.get(slot) }
|
|
||||||
}
|
|
||||||
fn primary_bytes(&self) -> &[u8] {
|
|
||||||
match &self.0 { IntColViewInner::Columnar(c) => c.primary_bytes(), IntColViewInner::Packed(c) => c.primary_bytes() }
|
|
||||||
}
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
|
||||||
// Box<dyn Iterator> implements Iterator, satisfying RPITIT across two distinct types.
|
|
||||||
let it: Box<dyn Iterator<Item = (usize, u32)> + '_> = match &self.0 {
|
|
||||||
IntColViewInner::Columnar(c) => Box::new(c.overflow_entries()),
|
|
||||||
IntColViewInner::Packed(c) => Box::new(c.overflow_entries()),
|
|
||||||
};
|
|
||||||
it
|
|
||||||
}
|
|
||||||
fn sum(&self) -> u64 {
|
|
||||||
match &self.0 { IntColViewInner::Columnar(c) => c.sum(), IntColViewInner::Packed(c) => c.sum() }
|
|
||||||
}
|
|
||||||
fn count_nonzero(&self) -> u64 {
|
|
||||||
match &self.0 { IntColViewInner::Columnar(c) => c.count_nonzero(), IntColViewInner::Packed(c) => c.count_nonzero() }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
pub struct PackedCompactIntMatrix {
|
pub struct PackedCompactIntMatrix {
|
||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
n_rows: usize,
|
n_rows: usize,
|
||||||
@@ -289,52 +133,30 @@ impl PackedCompactIntMatrix {
|
|||||||
for c in 0..n_cols {
|
for c in 0..n_cols {
|
||||||
let off_pos = PCMX_HEADER + c * 8;
|
let off_pos = PCMX_HEADER + c * 8;
|
||||||
let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
|
let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
|
||||||
// Parse embedded PCIV header at col_base
|
|
||||||
let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
|
let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
|
||||||
let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
|
|
||||||
let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
|
|
||||||
let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize;
|
let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize;
|
||||||
|
|
||||||
let primary_start = col_base + HEADER_SIZE;
|
let primary_start = col_base + HEADER_SIZE;
|
||||||
let data_offset = primary_start + n_pciv;
|
let data_offset = primary_start + n_pciv;
|
||||||
let index_offset = data_offset + n_ov * OVERFLOW_ENTRY_SIZE;
|
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov });
|
||||||
|
|
||||||
let mut index = Vec::with_capacity(n_idx);
|
|
||||||
for i in 0..n_idx {
|
|
||||||
index.push(parse_index_entry(&mmap, index_offset, i));
|
|
||||||
}
|
}
|
||||||
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(Self { mmap, n_rows, n_cols, columns })
|
Ok(Self { mmap, n_rows, n_cols, columns })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> {
|
pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||||
let ci = &self.columns[c];
|
let ci = &self.columns[c];
|
||||||
PackedIntCol {
|
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
||||||
primary: &self.mmap[ci.primary_start..ci.primary_start + self.n_rows],
|
let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE];
|
||||||
overflow: &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE],
|
IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows)
|
||||||
n_overflow: ci.n_overflow,
|
|
||||||
step: ci.step,
|
|
||||||
index: &ci.index,
|
|
||||||
n: self.n_rows,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||||
let col = self.col_slice(c);
|
let view = self.col_view(c);
|
||||||
let overflow: HashMap<usize, u32> = col.overflow_entries().collect();
|
let overflow: std::collections::HashMap<usize, u32> = view.overflow_entries().collect();
|
||||||
PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path)
|
PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path)
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
|
||||||
MemoryIntVec::from(&self.col_slice(c))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
|
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) }
|
||||||
self.col_slice(col).get(slot)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
|
for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
|
||||||
@@ -346,86 +168,61 @@ impl PackedCompactIntMatrix {
|
|||||||
|
|
||||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
(0..self.n_cols).into_par_iter()
|
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect()
|
||||||
.map(|c| self.col_slice(c).sum())
|
|
||||||
.collect()
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||||
Array1::from_vec(
|
Array1::from_vec(
|
||||||
(0..self.n_cols).into_par_iter()
|
(0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect()
|
||||||
.map(|c| self.col_slice(c).count_nonzero())
|
|
||||||
.collect()
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Pair primitives — sequential scan via col_slice().iter() ─────────────
|
|
||||||
|
|
||||||
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
|
||||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||||
.map(|(a, b)| a.min(b) as u64)
|
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
|
||||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum()
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
|
||||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
let ap = a >= t;
|
let ap = a >= t; let bp = b >= t;
|
||||||
let bp = b >= t;
|
|
||||||
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj))
|
.map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum()
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d })
|
.map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum()
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
|
||||||
if si == 0.0 || sj == 0.0 { return 0.0; }
|
if si == 0.0 || sj == 0.0 { return 0.0; }
|
||||||
self.col_slice(i).iter().zip(self.col_slice(j).iter())
|
self.col_view(i).iter().zip(self.col_view(j).iter())
|
||||||
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d })
|
.map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum()
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Matrix methods ────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
|
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
|
||||||
pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
|
pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||||
}
|
}
|
||||||
@@ -435,32 +232,21 @@ impl PackedCompactIntMatrix {
|
|||||||
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
||||||
let packed_path = dir.join("matrix.pcmx");
|
let packed_path = dir.join("matrix.pcmx");
|
||||||
if packed_path.exists() {
|
if packed_path.exists() {
|
||||||
// Matrix complete; remove any leftover column files from a killed cleanup.
|
|
||||||
if let Ok(meta) = MatrixMeta::load(dir) {
|
if let Ok(meta) = MatrixMeta::load(dir) {
|
||||||
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
|
||||||
let _ = fs::remove_file(dir.join("meta.json"));
|
let _ = fs::remove_file(dir.join("meta.json"));
|
||||||
}
|
}
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
let meta = MatrixMeta::load(dir)?;
|
let meta = MatrixMeta::load(dir)?;
|
||||||
let n_cols = meta.n_cols;
|
let n_cols = meta.n_cols;
|
||||||
|
|
||||||
// Compute offsets from file sizes — no column data loaded into RAM.
|
|
||||||
let col_sizes: Vec<u64> = (0..n_cols)
|
let col_sizes: Vec<u64> = (0..n_cols)
|
||||||
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
.map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
|
||||||
.collect::<io::Result<_>>()?;
|
.collect::<io::Result<_>>()?;
|
||||||
|
|
||||||
let header_size = (PCMX_HEADER + n_cols * 8) as u64;
|
let header_size = (PCMX_HEADER + n_cols * 8) as u64;
|
||||||
let mut col_offset = header_size;
|
let mut col_offset = header_size;
|
||||||
let mut offsets = Vec::with_capacity(n_cols);
|
let mut offsets = Vec::with_capacity(n_cols);
|
||||||
for &size in &col_sizes {
|
for &size in &col_sizes { offsets.push(col_offset); col_offset += size; }
|
||||||
offsets.push(col_offset);
|
|
||||||
col_offset += size;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write to a temp file; rename atomically so a killed process never leaves
|
|
||||||
// a truncated matrix.pcmx that would be mistaken for a complete file.
|
|
||||||
let tmp_path = dir.join("matrix.pcmx.tmp");
|
let tmp_path = dir.join("matrix.pcmx.tmp");
|
||||||
let mut out = BufWriter::new(File::create(&tmp_path)?);
|
let mut out = BufWriter::new(File::create(&tmp_path)?);
|
||||||
out.write_all(&PCMX_MAGIC)?;
|
out.write_all(&PCMX_MAGIC)?;
|
||||||
@@ -468,13 +254,10 @@ pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
|
|||||||
out.write_all(&(meta.n as u64).to_le_bytes())?;
|
out.write_all(&(meta.n as u64).to_le_bytes())?;
|
||||||
out.write_all(&(n_cols as u64).to_le_bytes())?;
|
out.write_all(&(n_cols as u64).to_le_bytes())?;
|
||||||
for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
|
for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
|
||||||
for c in 0..n_cols {
|
for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; }
|
||||||
io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
|
|
||||||
}
|
|
||||||
out.flush()?;
|
out.flush()?;
|
||||||
drop(out);
|
drop(out);
|
||||||
fs::rename(&tmp_path, &packed_path)?;
|
fs::rename(&tmp_path, &packed_path)?;
|
||||||
|
|
||||||
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
|
||||||
fs::remove_file(dir.join("meta.json"))?;
|
fs::remove_file(dir.join("meta.json"))?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -488,18 +271,14 @@ pub enum PersistentCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentCompactIntMatrix {
|
impl PersistentCompactIntMatrix {
|
||||||
/// Open from `layer_dir`, auto-detecting Packed or Columnar.
|
|
||||||
pub fn open(layer_dir: &Path) -> io::Result<Self> {
|
pub fn open(layer_dir: &Path) -> io::Result<Self> {
|
||||||
let counts_dir = layer_dir.join("counts");
|
let counts_dir = layer_dir.join("counts");
|
||||||
|
|
||||||
if counts_dir.join("matrix.pcmx").exists() {
|
if counts_dir.join("matrix.pcmx").exists() {
|
||||||
return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
|
return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
|
||||||
}
|
}
|
||||||
|
|
||||||
if MatrixMeta::load(&counts_dir).is_ok() {
|
if MatrixMeta::load(&counts_dir).is_ok() {
|
||||||
return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
|
return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(io::Error::new(
|
Err(io::Error::new(
|
||||||
io::ErrorKind::NotFound,
|
io::ErrorKind::NotFound,
|
||||||
format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
|
format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
|
||||||
@@ -509,7 +288,6 @@ impl PersistentCompactIntMatrix {
|
|||||||
pub fn n(&self) -> usize {
|
pub fn n(&self) -> usize {
|
||||||
match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
|
match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn n_cols(&self) -> usize {
|
pub fn n_cols(&self) -> usize {
|
||||||
match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
|
match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
|
||||||
}
|
}
|
||||||
@@ -521,10 +299,10 @@ impl PersistentCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn col_view(&self, c: usize) -> IntColView<'_> {
|
pub fn col_view(&self, c: usize) -> IntSliceView<'_> {
|
||||||
match self {
|
match self {
|
||||||
Self::Columnar(m) => IntColView(IntColViewInner::Columnar(m.col(c))),
|
Self::Columnar(m) => m.col(c).view(),
|
||||||
Self::Packed(m) => IntColView(IntColViewInner::Packed(m.col_slice(c))),
|
Self::Packed(m) => m.col_view(c),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -535,29 +313,18 @@ impl PersistentCompactIntMatrix {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn col_as_memory(&self, c: usize) -> MemoryIntVec {
|
|
||||||
match self {
|
|
||||||
Self::Columnar(m) => MemoryIntVec::from(m.col(c)),
|
|
||||||
Self::Packed(m) => m.col_as_memory(c),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||||
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
|
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
|
||||||
match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
|
match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn sum(&self) -> Array1<u64> {
|
pub fn sum(&self) -> Array1<u64> {
|
||||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count_nonzero(&self) -> Array1<u64> {
|
pub fn count_nonzero(&self) -> Array1<u64> {
|
||||||
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||||
}
|
}
|
||||||
@@ -576,7 +343,6 @@ impl PersistentCompactIntMatrix {
|
|||||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||||
match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
|
match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||||
ColumnarCompactIntMatrix::append_column(dir, value_of)
|
ColumnarCompactIntMatrix::append_column(dir, value_of)
|
||||||
}
|
}
|
||||||
@@ -613,16 +379,13 @@ impl PersistentCompactIntMatrixBuilder {
|
|||||||
fs::create_dir_all(dir)?;
|
fs::create_dir_all(dir)?;
|
||||||
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn n(&self) -> usize { self.n }
|
pub fn n(&self) -> usize { self.n }
|
||||||
pub fn n_cols(&self) -> usize { self.n_cols }
|
pub fn n_cols(&self) -> usize { self.n_cols }
|
||||||
|
|
||||||
pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
|
pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
|
||||||
let path = col_path(&self.dir, self.n_cols);
|
let path = col_path(&self.dir, self.n_cols);
|
||||||
self.n_cols += 1;
|
self.n_cols += 1;
|
||||||
PersistentCompactIntVecBuilder::new(self.n, &path)
|
PersistentCompactIntVecBuilder::new(self.n, &path)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn close(self) -> io::Result<()> {
|
pub fn close(self) -> io::Result<()> {
|
||||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||||
}
|
}
|
||||||
@@ -634,30 +397,20 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
|
|||||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
|
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
|
||||||
let n = self.n();
|
let n = self.n();
|
||||||
if g.indices.len() < 255 {
|
if g.indices.len() < 255 {
|
||||||
// Fast path: counts fit in u8 — accumulate directly into raw bytes.
|
|
||||||
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
let mut builder = TempCompactIntVecBuilder::new(n)?;
|
||||||
{
|
|
||||||
let primary = builder.primary_bytes_mut();
|
|
||||||
for &c in &g.indices {
|
for &c in &g.indices {
|
||||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
builder.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||||
inc_primary_bits(primary, &mask);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
builder.freeze()
|
builder.freeze()
|
||||||
} else {
|
} else {
|
||||||
// Slow path: chunk by 254 to keep per-chunk u8 safe, then add chunks.
|
|
||||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
for chunk in g.indices.chunks(254) {
|
for chunk in g.indices.chunks(254) {
|
||||||
let mut chunk_builder = TempCompactIntVecBuilder::new(n)?;
|
let mut chunk_b = TempCompactIntVecBuilder::new(n)?;
|
||||||
{
|
|
||||||
let primary = chunk_builder.primary_bytes_mut();
|
|
||||||
for &c in chunk {
|
for &c in chunk {
|
||||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
chunk_b.inc_predicate_fast(self.col_view(c), |v| v >= threshold);
|
||||||
inc_primary_bits(primary, &mask);
|
|
||||||
}
|
}
|
||||||
}
|
let frozen = chunk_b.freeze()?;
|
||||||
let chunk_frozen = chunk_builder.freeze()?;
|
result.add(frozen.view());
|
||||||
IntSliceMut::add(&mut result, &chunk_frozen);
|
|
||||||
}
|
}
|
||||||
result.freeze()
|
result.freeze()
|
||||||
}
|
}
|
||||||
@@ -666,10 +419,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
|
|||||||
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
|
||||||
let n = self.n();
|
let n = self.n();
|
||||||
let mut result = TempCompactIntVecBuilder::new(n)?;
|
let mut result = TempCompactIntVecBuilder::new(n)?;
|
||||||
for &c in &g.indices {
|
for &c in &g.indices { result.add(self.col_view(c)); }
|
||||||
let view = self.col_view(c);
|
|
||||||
IntSliceMut::add(&mut result, &view);
|
|
||||||
}
|
|
||||||
result.freeze()
|
result.freeze()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -677,8 +427,7 @@ impl MatrixGroupOps for PersistentCompactIntMatrix {
|
|||||||
let n = self.n();
|
let n = self.n();
|
||||||
let mut result = TempBitVecBuilder::new(n)?;
|
let mut result = TempBitVecBuilder::new(n)?;
|
||||||
for &c in &g.indices {
|
for &c in &g.indices {
|
||||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
result.or_where(self.col_view(c), |v| v >= threshold);
|
||||||
result.or(&mask);
|
|
||||||
}
|
}
|
||||||
result.freeze()
|
result.freeze()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,26 +5,24 @@ mod colgroup;
|
|||||||
mod format;
|
mod format;
|
||||||
mod intmatrix;
|
mod intmatrix;
|
||||||
mod layer_meta;
|
mod layer_meta;
|
||||||
mod memoryintvec;
|
|
||||||
mod memoryvec;
|
|
||||||
mod meta;
|
mod meta;
|
||||||
mod reader;
|
mod reader;
|
||||||
mod tempbitvec;
|
mod tempbitvec;
|
||||||
mod tempintvec;
|
mod tempintvec;
|
||||||
|
mod views;
|
||||||
pub mod traits;
|
pub mod traits;
|
||||||
|
|
||||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||||
pub use bitmatrix::{BitColView, PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
|
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
|
||||||
pub use builder::PersistentCompactIntVecBuilder;
|
pub use builder::PersistentCompactIntVecBuilder;
|
||||||
pub use colgroup::{ColGroup, MatrixGroupOps};
|
pub use colgroup::{ColGroup, MatrixGroupOps};
|
||||||
pub use intmatrix::{IntColView, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
||||||
pub use layer_meta::LayerMeta;
|
pub use layer_meta::LayerMeta;
|
||||||
pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
|
|
||||||
pub use memoryvec::MemoryBitVec;
|
|
||||||
pub use reader::PersistentCompactIntVec;
|
pub use reader::PersistentCompactIntVec;
|
||||||
pub use tempbitvec::TempBitVec;
|
pub use tempbitvec::TempBitVec;
|
||||||
pub use tempintvec::TempCompactIntVec;
|
pub use tempintvec::TempCompactIntVec;
|
||||||
pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
|
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
||||||
|
pub use views::{BitSliceView, IntSliceView};
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[path = "tests/mod.rs"]
|
#[path = "tests/mod.rs"]
|
||||||
|
|||||||
@@ -1,186 +0,0 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::io;
|
|
||||||
use std::ops::{Add, AddAssign, Sub, SubAssign};
|
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
use crate::builder::PersistentCompactIntVecBuilder;
|
|
||||||
use crate::format::{byte_count_nonzero, byte_sum};
|
|
||||||
use crate::traits::{IntSlice, IntSliceMut};
|
|
||||||
|
|
||||||
// ── MemoryIntVec ──────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct MemoryIntVec {
|
|
||||||
primary: Vec<u8>,
|
|
||||||
overflow: HashMap<usize, u32>,
|
|
||||||
n: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MemoryIntVec {
|
|
||||||
pub fn new(n: usize) -> Self {
|
|
||||||
Self { primary: vec![0u8; n], overflow: HashMap::new(), n }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn len(&self) -> usize { self.n }
|
|
||||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
|
||||||
|
|
||||||
/// Construct directly from a pre-built primary array (no overflow — all values < 255).
|
|
||||||
pub(crate) fn from_primary(primary: Vec<u8>) -> Self {
|
|
||||||
let n = primary.len();
|
|
||||||
Self { primary, overflow: HashMap::new(), n }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn from_primary_and_overflow(primary: Vec<u8>, overflow: HashMap<usize, u32>) -> Self {
|
|
||||||
let n = primary.len();
|
|
||||||
Self { primary, overflow, n }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary }
|
|
||||||
pub(crate) fn overflow_map(&self) -> &HashMap<usize, u32> { &self.overflow }
|
|
||||||
|
|
||||||
pub fn get(&self, slot: usize) -> u32 {
|
|
||||||
match self.primary[slot] {
|
|
||||||
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
|
||||||
v => v as u32,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn sum(&self) -> u64 {
|
|
||||||
byte_sum(&self.primary, self.overflow.values().copied())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn count_nonzero(&self) -> u64 {
|
|
||||||
byte_count_nonzero(&self.primary)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn filled(n: usize, value: u32) -> Self {
|
|
||||||
if value < 255 {
|
|
||||||
Self { primary: vec![value as u8; n], overflow: HashMap::new(), n }
|
|
||||||
} else {
|
|
||||||
Self { primary: vec![255u8; n], overflow: (0..n).map(|i| (i, value)).collect(), n }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn iter(&self) -> MemoryIntIter<'_> {
|
|
||||||
MemoryIntIter { vec: self, slot: 0 }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write to disk and return a writable builder at `path`.
|
|
||||||
pub fn persist(&self, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
|
|
||||||
PersistentCompactIntVecBuilder::from_memory(self, path)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
impl IntSlice for MemoryIntVec {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
fn get(&self, slot: usize) -> u32 { self.get(slot) }
|
|
||||||
fn primary_bytes(&self) -> &[u8] { &self.primary }
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
|
||||||
self.overflow.iter().map(|(&k, &v)| (k, v))
|
|
||||||
}
|
|
||||||
fn iter(&self) -> impl Iterator<Item = u32> + '_ { self.iter() }
|
|
||||||
fn sum(&self) -> u64 { self.sum() }
|
|
||||||
fn count_nonzero(&self) -> u64 { self.count_nonzero() }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl IntSliceMut for MemoryIntVec {
|
|
||||||
fn set(&mut self, slot: usize, value: u32) {
|
|
||||||
if value < 255 {
|
|
||||||
self.primary[slot] = value as u8;
|
|
||||||
self.overflow.remove(&slot);
|
|
||||||
} else {
|
|
||||||
self.primary[slot] = 255;
|
|
||||||
self.overflow.insert(slot, value);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.primary }
|
|
||||||
fn clear_overflow(&mut self) { self.overflow.clear(); }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── From conversions ──────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
impl MemoryIntVec {
|
|
||||||
/// Bulk copy from another `MemoryIntVec`: memcpy for the primary bytes,
|
|
||||||
/// clone for the overflow map.
|
|
||||||
pub fn copy_from_memory(&mut self, src: &MemoryIntVec) {
|
|
||||||
assert_eq!(self.n, src.n, "MemoryIntVec length mismatch");
|
|
||||||
self.primary.copy_from_slice(&src.primary);
|
|
||||||
self.overflow = src.overflow.clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<S: IntSlice> From<&S> for MemoryIntVec {
|
|
||||||
fn from(src: &S) -> Self {
|
|
||||||
Self::from_primary_and_overflow(
|
|
||||||
src.primary_bytes().to_vec(),
|
|
||||||
src.overflow_entries().collect(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
|
|
||||||
|
|
||||||
impl<B: IntSlice> Add<&B> for MemoryIntVec {
|
|
||||||
type Output = MemoryIntVec;
|
|
||||||
fn add(mut self, rhs: &B) -> MemoryIntVec { IntSliceMut::add(&mut self, rhs); self }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: IntSlice> Sub<&B> for MemoryIntVec {
|
|
||||||
type Output = MemoryIntVec;
|
|
||||||
fn sub(mut self, rhs: &B) -> MemoryIntVec { self.diff(rhs); self }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
|
|
||||||
|
|
||||||
impl<B: IntSlice> Add<&B> for &MemoryIntVec {
|
|
||||||
type Output = MemoryIntVec;
|
|
||||||
fn add(self, rhs: &B) -> MemoryIntVec { self.clone().add(rhs) }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: IntSlice> Sub<&B> for &MemoryIntVec {
|
|
||||||
type Output = MemoryIntVec;
|
|
||||||
fn sub(self, rhs: &B) -> MemoryIntVec { self.clone().sub(rhs) }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── std::ops — in-place assign ────────────────────────────────────────────────
|
|
||||||
|
|
||||||
impl<B: IntSlice> AddAssign<&B> for MemoryIntVec {
|
|
||||||
fn add_assign(&mut self, rhs: &B) { IntSliceMut::add(self, rhs); }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: IntSlice> SubAssign<&B> for MemoryIntVec {
|
|
||||||
fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Iterator ──────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
pub struct MemoryIntIter<'a> {
|
|
||||||
vec: &'a MemoryIntVec,
|
|
||||||
slot: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Iterator for MemoryIntIter<'_> {
|
|
||||||
type Item = u32;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<u32> {
|
|
||||||
if self.slot >= self.vec.n { return None; }
|
|
||||||
let v = self.vec.get(self.slot);
|
|
||||||
self.slot += 1;
|
|
||||||
Some(v)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
|
||||||
let rem = self.vec.n - self.slot;
|
|
||||||
(rem, Some(rem))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ExactSizeIterator for MemoryIntIter<'_> {}
|
|
||||||
|
|
||||||
impl<'a> IntoIterator for &'a MemoryIntVec {
|
|
||||||
type Item = u32;
|
|
||||||
type IntoIter = MemoryIntIter<'a>;
|
|
||||||
fn into_iter(self) -> MemoryIntIter<'a> { self.iter() }
|
|
||||||
}
|
|
||||||
@@ -1,138 +0,0 @@
|
|||||||
use std::io;
|
|
||||||
use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not};
|
|
||||||
use std::path::Path;
|
|
||||||
|
|
||||||
use crate::bitvec::{BitIter, PersistentBitVecBuilder, n_words};
|
|
||||||
use crate::traits::{BitSlice, BitSliceMut};
|
|
||||||
|
|
||||||
// ── MemoryBitVec ──────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct MemoryBitVec {
|
|
||||||
words: Vec<u64>,
|
|
||||||
n: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MemoryBitVec {
|
|
||||||
pub fn new(n: usize) -> Self {
|
|
||||||
Self { words: vec![0u64; n_words(n)], n }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn ones(n: usize) -> Self {
|
|
||||||
let rem = n % 64;
|
|
||||||
let mut words = vec![u64::MAX; n_words(n)];
|
|
||||||
if rem != 0 {
|
|
||||||
if let Some(last) = words.last_mut() { *last = (1u64 << rem) - 1; }
|
|
||||||
}
|
|
||||||
Self { words, n }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn from_words(words: Vec<u64>, n: usize) -> Self {
|
|
||||||
Self { words, n }
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn len(&self) -> usize { self.n }
|
|
||||||
pub fn is_empty(&self) -> bool { self.n == 0 }
|
|
||||||
|
|
||||||
pub fn get(&self, slot: usize) -> bool {
|
|
||||||
(self.words[slot >> 6] >> (slot & 63)) & 1 != 0
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Write to disk and return a writable builder positioned at the same path.
|
|
||||||
pub fn persist(&self, path: &Path) -> io::Result<PersistentBitVecBuilder> {
|
|
||||||
let mut b = PersistentBitVecBuilder::new(self.n, path)?;
|
|
||||||
b.copy_from(self);
|
|
||||||
Ok(b)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
impl BitSlice for MemoryBitVec {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
fn words(&self) -> &[u64] { &self.words }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BitSliceMut for MemoryBitVec {
|
|
||||||
fn words_mut(&mut self) -> &mut [u64] { &mut self.words }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── From conversions ──────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
impl<S: BitSlice> From<&S> for MemoryBitVec {
|
|
||||||
fn from(src: &S) -> Self {
|
|
||||||
Self { words: src.words().to_vec(), n: src.len() }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitAnd<&B> for MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn bitand(mut self, rhs: &B) -> MemoryBitVec { self.and(rhs); self }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitOr<&B> for MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn bitor(mut self, rhs: &B) -> MemoryBitVec { self.or(rhs); self }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitXor<&B> for MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn bitxor(mut self, rhs: &B) -> MemoryBitVec { self.xor(rhs); self }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Not for MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn not(mut self) -> MemoryBitVec { BitSliceMut::not(&mut self); self }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitAnd<&B> for &MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn bitand(self, rhs: &B) -> MemoryBitVec { self.clone().bitand(rhs) }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitOr<&B> for &MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn bitor(self, rhs: &B) -> MemoryBitVec { self.clone().bitor(rhs) }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitXor<&B> for &MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn bitxor(self, rhs: &B) -> MemoryBitVec { self.clone().bitxor(rhs) }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Not for &MemoryBitVec {
|
|
||||||
type Output = MemoryBitVec;
|
|
||||||
fn not(self) -> MemoryBitVec { !self.clone() }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── std::ops — in-place assign ────────────────────────────────────────────────
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitAndAssign<&B> for MemoryBitVec {
|
|
||||||
fn bitand_assign(&mut self, rhs: &B) { self.and(rhs); }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitOrAssign<&B> for MemoryBitVec {
|
|
||||||
fn bitor_assign(&mut self, rhs: &B) { self.or(rhs); }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<B: BitSlice> BitXorAssign<&B> for MemoryBitVec {
|
|
||||||
fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); }
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Iterator ──────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
impl MemoryBitVec {
|
|
||||||
pub fn iter(&self) -> BitIter<'_> {
|
|
||||||
BitIter { words: &self.words, slot: 0, n: self.n }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> IntoIterator for &'a MemoryBitVec {
|
|
||||||
type Item = bool;
|
|
||||||
type IntoIter = BitIter<'a>;
|
|
||||||
fn into_iter(self) -> BitIter<'a> { self.iter() }
|
|
||||||
}
|
|
||||||
+55
-211
@@ -5,6 +5,7 @@ use std::path::{Path, PathBuf};
|
|||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
|
|
||||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
|
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
|
||||||
|
use crate::views::IntSliceView;
|
||||||
|
|
||||||
pub struct PersistentCompactIntVec {
|
pub struct PersistentCompactIntVec {
|
||||||
mmap: Mmap,
|
mmap: Mmap,
|
||||||
@@ -18,15 +19,11 @@ pub struct PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PersistentCompactIntVec {
|
impl PersistentCompactIntVec {
|
||||||
/// Opens a persistent compact int vector from the given path.
|
|
||||||
pub fn open(path: &Path) -> io::Result<Self> {
|
pub fn open(path: &Path) -> io::Result<Self> {
|
||||||
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
let mmap = unsafe { Mmap::map(&File::open(path)?)? };
|
||||||
|
|
||||||
if mmap.len() < HEADER_SIZE {
|
if mmap.len() < HEADER_SIZE {
|
||||||
return Err(io::Error::new(
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short"));
|
||||||
io::ErrorKind::InvalidData,
|
|
||||||
"PCIV file too short",
|
|
||||||
));
|
|
||||||
}
|
}
|
||||||
if &mmap[0..4] != &MAGIC {
|
if &mmap[0..4] != &MAGIC {
|
||||||
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
|
||||||
@@ -46,34 +43,13 @@ impl PersistentCompactIntVec {
|
|||||||
index.push(parse_index_entry(&mmap, index_offset, i));
|
index.push(parse_index_entry(&mmap, index_offset, i));
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset, path: path.to_path_buf() })
|
||||||
mmap,
|
|
||||||
n,
|
|
||||||
n_overflow,
|
|
||||||
step,
|
|
||||||
index,
|
|
||||||
primary_offset,
|
|
||||||
data_offset,
|
|
||||||
path: path.to_path_buf(),
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the path of the compact int vector file.
|
pub fn path(&self) -> &Path { &self.path }
|
||||||
pub fn path(&self) -> &Path {
|
pub fn len(&self) -> usize { self.n }
|
||||||
&self.path
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the length of the compact int vector.
|
|
||||||
pub fn len(&self) -> usize {
|
|
||||||
self.n
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns whether the compact int vector is empty.
|
|
||||||
pub fn is_empty(&self) -> bool {
|
|
||||||
self.n == 0
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the value at the given slot.
|
|
||||||
pub fn get(&self, slot: usize) -> u32 {
|
pub fn get(&self, slot: usize) -> u32 {
|
||||||
match self.mmap[self.primary_offset + slot] {
|
match self.mmap[self.primary_offset + slot] {
|
||||||
255 => self.overflow_get(slot),
|
255 => self.overflow_get(slot),
|
||||||
@@ -81,27 +57,15 @@ impl PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the value at the given slot from the overflow region.
|
|
||||||
fn overflow_get(&self, slot: usize) -> u32 {
|
fn overflow_get(&self, slot: usize) -> u32 {
|
||||||
let pos_start;
|
let (pos_start, pos_end) = if self.step == 0 {
|
||||||
let pos_end;
|
(0, self.n_overflow)
|
||||||
|
|
||||||
if self.step == 0 {
|
|
||||||
pos_start = 0;
|
|
||||||
pos_end = self.n_overflow;
|
|
||||||
} else {
|
} else {
|
||||||
let i = self
|
let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
|
||||||
.index
|
let start = self.index[i].1;
|
||||||
.partition_point(|&(s, _)| s <= slot)
|
let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
|
||||||
.saturating_sub(1);
|
(start, end)
|
||||||
pos_start = self.index[i].1;
|
|
||||||
pos_end = if i + 1 < self.index.len() {
|
|
||||||
self.index[i + 1].1
|
|
||||||
} else {
|
|
||||||
self.n_overflow
|
|
||||||
};
|
};
|
||||||
}
|
|
||||||
|
|
||||||
let mut lo = pos_start;
|
let mut lo = pos_start;
|
||||||
let mut hi = pos_end;
|
let mut hi = pos_end;
|
||||||
while lo < hi {
|
while lo < hi {
|
||||||
@@ -116,14 +80,12 @@ impl PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
/// Returns the slot at the given index in the overflow region.
|
|
||||||
fn data_slot(&self, i: usize) -> usize {
|
fn data_slot(&self, i: usize) -> usize {
|
||||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
|
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
|
||||||
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
|
u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
/// Returns the value at the given index in the overflow region.
|
|
||||||
fn data_value(&self, i: usize) -> u32 {
|
fn data_value(&self, i: usize) -> u32 {
|
||||||
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
|
let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
|
||||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||||
@@ -139,121 +101,70 @@ impl PersistentCompactIntVec {
|
|||||||
byte_count_nonzero(primary)
|
byte_count_nonzero(primary)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
/// Lightweight zero-copy view — primary and overflow point into the mmap.
|
||||||
/// Returns the Bray-Curtis distance between two compact int vectors.
|
pub fn view(&self) -> IntSliceView<'_> {
|
||||||
|
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||||
|
let overflow_raw = &self.mmap[self.data_offset..self.data_offset + self.n_overflow * OVERFLOW_ENTRY_SIZE];
|
||||||
|
IntSliceView::new(primary, overflow_raw, self.n_overflow, self.n)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn iter(&self) -> Iter<'_> {
|
||||||
|
Iter { pciv: self, slot: 0, overflow_pos: 0 }
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Distance methods ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
let sum_min = self.partial_bray_dist(other);
|
let sum_min = self.partial_bray_dist(other);
|
||||||
let denom = self.sum() + other.sum();
|
let denom = self.sum() + other.sum();
|
||||||
if denom == 0 {
|
if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
1.0 - 2.0 * sum_min as f64 / denom as f64
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis.
|
|
||||||
/// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`.
|
|
||||||
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
|
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| a.min(b) as u64)
|
|
||||||
.sum()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
|
|
||||||
///
|
|
||||||
/// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts.
|
|
||||||
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
let sum_a = self.sum() as f64;
|
let sa = self.sum() as f64;
|
||||||
let sum_b = other.sum() as f64;
|
let sb = other.sum() as f64;
|
||||||
if sum_a == 0.0 && sum_b == 0.0 {
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
return 0.0;
|
1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
|
||||||
}
|
|
||||||
let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b);
|
|
||||||
1.0 - sum_min
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors.
|
pub fn partial_relfreq_bray_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||||
///
|
|
||||||
/// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency
|
|
||||||
/// Bray-Curtis distance over a set of vector pairs.
|
|
||||||
///
|
|
||||||
/// Arguments:
|
|
||||||
/// - `other`: the other compact int vector to compare with
|
|
||||||
/// - `sum_a`: the sum of the first vector's counts
|
|
||||||
/// - `sum_b`: the sum of the second vector's counts
|
|
||||||
///
|
|
||||||
/// Returns the sum of the minimum relative frequencies at each index.
|
|
||||||
pub fn partial_relfreq_bray_dist(
|
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
sum_a: f64,
|
|
||||||
sum_b: f64,
|
|
||||||
) -> f64 {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
let sum_min: f64 = self
|
self.iter().zip(other.iter())
|
||||||
.iter()
|
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| {
|
.map(|(a, b)| {
|
||||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||||
pa.min(pb)
|
pa.min(pb)
|
||||||
})
|
})
|
||||||
.sum();
|
.sum()
|
||||||
sum_min
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the euclidean distance between two compact int vectors.
|
|
||||||
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
self.partial_euclidean_dist(other).sqrt()
|
self.partial_euclidean_dist(other).sqrt()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial euclidean distance between two compact int vectors.
|
|
||||||
///
|
|
||||||
/// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance
|
|
||||||
/// over a set of vector pairs.
|
|
||||||
///
|
|
||||||
/// The result is the sum of the squared differences between corresponding elements of the two
|
|
||||||
/// vectors.
|
|
||||||
pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||||
.map(|(a, b)| {
|
|
||||||
let d = a as f64 - b as f64;
|
|
||||||
d * d
|
|
||||||
})
|
|
||||||
.sum()
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the relative frequency euclidean distance between two compact int vectors.
|
|
||||||
///
|
|
||||||
/// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts.
|
|
||||||
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
let sa = self.sum() as f64;
|
||||||
let sum_a = self.sum() as f64;
|
let sb = other.sum() as f64;
|
||||||
let sum_b = other.sum() as f64;
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
if sum_a == 0.0 && sum_b == 0.0 {
|
self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
self.partial_relfreq_euclidean_dist(other, sum_a, sum_b)
|
|
||||||
.sqrt()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial relative frequency euclidean distance between two compact int vectors.
|
pub fn partial_relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||||
///
|
|
||||||
/// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency
|
|
||||||
/// euclidean distance over a set of vector pairs.
|
|
||||||
pub fn partial_relfreq_euclidean_dist(
|
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
sum_a: f64,
|
|
||||||
sum_b: f64,
|
|
||||||
) -> f64 {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| {
|
.map(|(a, b)| {
|
||||||
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
|
||||||
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
|
||||||
@@ -263,46 +174,19 @@ impl PersistentCompactIntVec {
|
|||||||
.sum()
|
.sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the Euclidean distance between two compact int vectors using the Hellinger transform.
|
|
||||||
///
|
|
||||||
/// The Hellinger transform is applied to the raw counts of each vector, and the result is
|
|
||||||
/// the Euclidean distance between the transformed vectors. The Hellinger transform is defined
|
|
||||||
/// as the square root of the relative frequencies.
|
|
||||||
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
let sa = self.sum() as f64;
|
||||||
let sum_a = self.sum() as f64;
|
let sb = other.sum() as f64;
|
||||||
let sum_b = other.sum() as f64;
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
if sum_a == 0.0 && sum_b == 0.0 {
|
self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
self.partial_hellinger_euclidean_dist(other, sum_a, sum_b)
|
|
||||||
.sqrt()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the partial Hellinger Euclidean distance between two compact int vectors.
|
pub fn partial_hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
|
||||||
///
|
|
||||||
/// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger
|
|
||||||
/// Euclidean distance over a set of vector pairs.
|
|
||||||
pub fn partial_hellinger_euclidean_dist(
|
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
sum_a: f64,
|
|
||||||
sum_b: f64,
|
|
||||||
) -> f64 {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
|
||||||
.map(|(a, b)| {
|
.map(|(a, b)| {
|
||||||
let pa = if sum_a > 0.0 {
|
let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
|
||||||
(a as f64 / sum_a).sqrt()
|
let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
let pb = if sum_b > 0.0 {
|
|
||||||
(b as f64 / sum_b).sqrt()
|
|
||||||
} else {
|
|
||||||
0.0
|
|
||||||
};
|
|
||||||
let d = pa - pb;
|
let d = pa - pb;
|
||||||
d * d
|
d * d
|
||||||
})
|
})
|
||||||
@@ -314,22 +198,13 @@ impl PersistentCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
|
pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
|
||||||
let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold);
|
let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold);
|
||||||
if union == 0 {
|
if union == 0 { 0.0 } else { 1.0 - intersection as f64 / union as f64 }
|
||||||
return 0.0;
|
|
||||||
}
|
|
||||||
1.0 - intersection as f64 / union as f64
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn partial_threshold_jaccard_dist(
|
pub fn partial_threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> (u64, u64) {
|
||||||
&self,
|
|
||||||
other: &PersistentCompactIntVec,
|
|
||||||
threshold: u32,
|
|
||||||
) -> (u64, u64) {
|
|
||||||
assert_eq!(self.n, other.len(), "length mismatch");
|
assert_eq!(self.n, other.len(), "length mismatch");
|
||||||
self.iter()
|
self.iter().zip(other.iter())
|
||||||
.zip(other.iter())
|
|
||||||
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
let ap = a >= threshold;
|
let ap = a >= threshold;
|
||||||
let bp = b >= threshold;
|
let bp = b >= threshold;
|
||||||
@@ -340,41 +215,12 @@ impl PersistentCompactIntVec {
|
|||||||
pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
|
||||||
self.threshold_jaccard_dist(other, 1)
|
self.threshold_jaccard_dist(other, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn iter(&self) -> Iter<'_> {
|
|
||||||
Iter {
|
|
||||||
pciv: self,
|
|
||||||
slot: 0,
|
|
||||||
overflow_pos: 0,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── IntSlice impl ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
use crate::traits::IntSlice;
|
|
||||||
|
|
||||||
impl IntSlice for PersistentCompactIntVec {
|
|
||||||
fn len(&self) -> usize { self.n }
|
|
||||||
fn get(&self, slot: usize) -> u32 { self.get(slot) }
|
|
||||||
fn primary_bytes(&self) -> &[u8] {
|
|
||||||
&self.mmap[self.primary_offset..self.primary_offset + self.n]
|
|
||||||
}
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
|
||||||
(0..self.n_overflow).map(|i| (self.data_slot(i), self.data_value(i)))
|
|
||||||
}
|
|
||||||
fn iter(&self) -> impl Iterator<Item = u32> + '_ { self.iter() }
|
|
||||||
fn sum(&self) -> u64 { self.sum() }
|
|
||||||
fn count_nonzero(&self) -> u64 { self.count_nonzero() }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> IntoIterator for &'a PersistentCompactIntVec {
|
impl<'a> IntoIterator for &'a PersistentCompactIntVec {
|
||||||
type Item = u32;
|
type Item = u32;
|
||||||
type IntoIter = Iter<'a>;
|
type IntoIter = Iter<'a>;
|
||||||
|
fn into_iter(self) -> Iter<'a> { self.iter() }
|
||||||
fn into_iter(self) -> Iter<'a> {
|
|
||||||
self.iter()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Iter<'a> {
|
pub struct Iter<'a> {
|
||||||
@@ -389,9 +235,7 @@ impl Iterator for Iter<'_> {
|
|||||||
type Item = u32;
|
type Item = u32;
|
||||||
|
|
||||||
fn next(&mut self) -> Option<u32> {
|
fn next(&mut self) -> Option<u32> {
|
||||||
if self.slot >= self.pciv.n {
|
if self.slot >= self.pciv.n { return None; }
|
||||||
return None;
|
|
||||||
}
|
|
||||||
let v = self.pciv.mmap[self.pciv.primary_offset + self.slot];
|
let v = self.pciv.mmap[self.pciv.primary_offset + self.slot];
|
||||||
self.slot += 1;
|
self.slot += 1;
|
||||||
if v < 255 {
|
if v < 255 {
|
||||||
|
|||||||
@@ -4,13 +4,10 @@ use std::path::Path;
|
|||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
||||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||||
use crate::traits::{BitSlice, BitSliceMut};
|
use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
|
||||||
|
|
||||||
// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
|
// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
|
||||||
|
|
||||||
/// A bit vector backed by a temporary file.
|
|
||||||
/// Implements [`BitSlice`]; the file is deleted when this value is dropped.
|
|
||||||
/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
|
|
||||||
pub struct TempBitVec {
|
pub struct TempBitVec {
|
||||||
vec: PersistentBitVec,
|
vec: PersistentBitVec,
|
||||||
// Dropped after `vec` (field order), so the mmap is released before the
|
// Dropped after `vec` (field order), so the mmap is released before the
|
||||||
@@ -19,25 +16,33 @@ pub struct TempBitVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl TempBitVec {
|
impl TempBitVec {
|
||||||
/// Copy to a permanent file and open as a [`PersistentBitVec`].
|
|
||||||
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
|
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
|
||||||
std::fs::copy(self.vec.path(), path)?;
|
std::fs::copy(self.vec.path(), path)?;
|
||||||
PersistentBitVec::open(path)
|
PersistentBitVec::open(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn len(&self) -> usize { self.vec.len() }
|
pub fn len(&self) -> usize {
|
||||||
pub fn is_empty(&self) -> bool { self.vec.is_empty() }
|
self.vec.len()
|
||||||
}
|
}
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
impl BitSlice for TempBitVec {
|
self.vec.is_empty()
|
||||||
fn len(&self) -> usize { self.vec.len() }
|
}
|
||||||
fn words(&self) -> &[u64] { self.vec.words() }
|
pub fn get(&self, slot: usize) -> bool {
|
||||||
|
self.vec.get(slot)
|
||||||
|
}
|
||||||
|
pub fn count_ones(&self) -> u64 {
|
||||||
|
self.vec.count_ones()
|
||||||
|
}
|
||||||
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
|
self.vec.view()
|
||||||
|
}
|
||||||
|
pub fn iter(&self) -> BitSliceIter<'_> {
|
||||||
|
self.view().iter()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
|
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
|
||||||
|
|
||||||
/// Writable builder for a [`TempBitVec`]. `pub(crate)` — callers receive
|
|
||||||
/// only the frozen result via [`freeze`](Self::freeze).
|
|
||||||
pub(crate) struct TempBitVecBuilder {
|
pub(crate) struct TempBitVecBuilder {
|
||||||
builder: PersistentBitVecBuilder,
|
builder: PersistentBitVecBuilder,
|
||||||
temp: TempDir,
|
temp: TempDir,
|
||||||
@@ -51,19 +56,35 @@ impl TempBitVecBuilder {
|
|||||||
Ok(Self { builder, temp })
|
Ok(Self { builder, temp })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Finalize writes and return a frozen, read-only [`TempBitVec`].
|
|
||||||
pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
|
pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
|
||||||
let Self { builder, temp } = self;
|
let Self { builder, temp } = self;
|
||||||
let vec = builder.finish()?;
|
let vec = builder.finish()?;
|
||||||
Ok(TempBitVec { vec, _temp: temp })
|
Ok(TempBitVec { vec, _temp: temp })
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl BitSlice for TempBitVecBuilder {
|
pub fn set(&mut self, slot: usize, value: bool) {
|
||||||
fn len(&self) -> usize { self.builder.len() }
|
self.builder.set(slot, value);
|
||||||
fn words(&self) -> &[u64] { self.builder.words() }
|
}
|
||||||
}
|
pub(crate) fn view(&self) -> BitSliceView<'_> {
|
||||||
|
self.builder.view()
|
||||||
|
}
|
||||||
|
|
||||||
impl BitSliceMut for TempBitVecBuilder {
|
pub fn or(&mut self, other: BitSliceView<'_>) {
|
||||||
fn words_mut(&mut self) -> &mut [u64] { self.builder.words_mut() }
|
self.builder.or(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set self[slot] where pred(col[slot]) is true. Two-pass: primary then overflow.
|
||||||
|
pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
for slot in 0..col.len() {
|
||||||
|
let b = col.primary_bytes()[slot];
|
||||||
|
if b < 255 && pred(b as u32) {
|
||||||
|
self.builder.set(slot, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (slot, val) in col.overflow_entries() {
|
||||||
|
if pred(val) {
|
||||||
|
self.builder.set(slot, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,13 +5,10 @@ use tempfile::TempDir;
|
|||||||
|
|
||||||
use crate::builder::PersistentCompactIntVecBuilder;
|
use crate::builder::PersistentCompactIntVecBuilder;
|
||||||
use crate::reader::PersistentCompactIntVec;
|
use crate::reader::PersistentCompactIntVec;
|
||||||
use crate::traits::{IntSlice, IntSliceMut};
|
use crate::views::{BitSliceView, IntSliceView};
|
||||||
|
|
||||||
// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
|
// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
|
||||||
|
|
||||||
/// A compact int vector backed by a temporary file.
|
|
||||||
/// Implements [`IntSlice`]; the file is deleted when this value is dropped.
|
|
||||||
/// Call [`make_persistent`](Self::make_persistent) to promote to a durable file.
|
|
||||||
pub struct TempCompactIntVec {
|
pub struct TempCompactIntVec {
|
||||||
vec: PersistentCompactIntVec,
|
vec: PersistentCompactIntVec,
|
||||||
// Dropped after `vec` (field order), so the mmap is released before the
|
// Dropped after `vec` (field order), so the mmap is released before the
|
||||||
@@ -20,7 +17,6 @@ pub struct TempCompactIntVec {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl TempCompactIntVec {
|
impl TempCompactIntVec {
|
||||||
/// Copy to a permanent file and open as a [`PersistentCompactIntVec`].
|
|
||||||
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
|
pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
|
||||||
std::fs::copy(self.vec.path(), path)?;
|
std::fs::copy(self.vec.path(), path)?;
|
||||||
PersistentCompactIntVec::open(path)
|
PersistentCompactIntVec::open(path)
|
||||||
@@ -28,23 +24,14 @@ impl TempCompactIntVec {
|
|||||||
|
|
||||||
pub fn len(&self) -> usize { self.vec.len() }
|
pub fn len(&self) -> usize { self.vec.len() }
|
||||||
pub fn is_empty(&self) -> bool { self.vec.is_empty() }
|
pub fn is_empty(&self) -> bool { self.vec.is_empty() }
|
||||||
}
|
pub fn get(&self, slot: usize) -> u32 { self.vec.get(slot) }
|
||||||
|
pub fn sum(&self) -> u64 { self.vec.sum() }
|
||||||
impl IntSlice for TempCompactIntVec {
|
pub fn view(&self) -> IntSliceView<'_> { self.vec.view() }
|
||||||
fn len(&self) -> usize { self.vec.len() }
|
pub fn iter(&self) -> crate::reader::Iter<'_> { self.vec.iter() }
|
||||||
fn get(&self, slot: usize) -> u32 { self.vec.get(slot) }
|
|
||||||
fn primary_bytes(&self) -> &[u8] { self.vec.primary_bytes() }
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
|
||||||
self.vec.overflow_entries()
|
|
||||||
}
|
|
||||||
fn sum(&self) -> u64 { self.vec.sum() }
|
|
||||||
fn count_nonzero(&self) -> u64 { self.vec.count_nonzero() }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
|
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
|
||||||
|
|
||||||
/// Writable builder for a [`TempCompactIntVec`]. `pub(crate)` — callers
|
|
||||||
/// receive only the frozen result via [`freeze`](Self::freeze).
|
|
||||||
pub(crate) struct TempCompactIntVecBuilder {
|
pub(crate) struct TempCompactIntVecBuilder {
|
||||||
builder: PersistentCompactIntVecBuilder,
|
builder: PersistentCompactIntVecBuilder,
|
||||||
temp: TempDir,
|
temp: TempDir,
|
||||||
@@ -58,25 +45,47 @@ impl TempCompactIntVecBuilder {
|
|||||||
Ok(Self { builder, temp })
|
Ok(Self { builder, temp })
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Finalize writes and return a frozen, read-only [`TempCompactIntVec`].
|
|
||||||
pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
|
pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
|
||||||
let Self { builder, temp } = self;
|
let Self { builder, temp } = self;
|
||||||
let vec = builder.finish()?;
|
let vec = builder.finish()?;
|
||||||
Ok(TempCompactIntVec { vec, _temp: temp })
|
Ok(TempCompactIntVec { vec, _temp: temp })
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl IntSlice for TempCompactIntVecBuilder {
|
// ── Delegation methods ────────────────────────────────────────────────────
|
||||||
fn len(&self) -> usize { self.builder.len() }
|
|
||||||
fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
|
pub(crate) fn n(&self) -> usize { self.builder.len() }
|
||||||
fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
|
pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
|
||||||
self.builder.overflow_entries()
|
pub(crate) fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
|
||||||
|
|
||||||
|
pub(crate) fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
|
||||||
|
pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
|
||||||
|
|
||||||
|
pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) {
|
||||||
|
self.builder.inc_present(col);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl IntSliceMut for TempCompactIntVecBuilder {
|
pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
||||||
fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
|
self.builder.inc_present_fast(col);
|
||||||
fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
|
}
|
||||||
fn clear_overflow(&mut self) { self.builder.clear_overflow(); }
|
|
||||||
|
pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
self.builder.inc_predicate(col, pred);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
|
self.builder.inc_predicate_fast(col, pred);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn add(&mut self, other: IntSliceView<'_>) {
|
||||||
|
self.builder.add(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) {
|
||||||
|
self.builder.mask_with(mask);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
|
||||||
|
pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
|
||||||
|
pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
|
use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||||
use crate::traits::{BitPartials, BitSlice, BitSliceMut};
|
use crate::traits::BitPartials;
|
||||||
|
|
||||||
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||||
let n = cols.first().map_or(0, |c| c.len());
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::traits::{BitSlice, BitSliceMut};
|
|
||||||
use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||||
|
|
||||||
fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) {
|
fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) {
|
||||||
@@ -78,7 +77,7 @@ fn op_and() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pbiv");
|
let path = dir.path().join("out.pbiv");
|
||||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.and(&rb);
|
b.and(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentBitVec::open(&path).unwrap();
|
let r = PersistentBitVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
|
||||||
@@ -91,7 +90,7 @@ fn op_or() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pbiv");
|
let path = dir.path().join("out.pbiv");
|
||||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.or(&rb);
|
b.or(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentBitVec::open(&path).unwrap();
|
let r = PersistentBitVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
|
||||||
@@ -104,7 +103,7 @@ fn op_xor() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pbiv");
|
let path = dir.path().join("out.pbiv");
|
||||||
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.xor(&rb);
|
b.xor(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentBitVec::open(&path).unwrap();
|
let r = PersistentBitVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
|
||||||
|
|||||||
@@ -5,8 +5,7 @@ use crate::{
|
|||||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||||
};
|
};
|
||||||
use crate::traits::{BitSlice, BitSliceMut, IntSlice, IntSliceMut};
|
use crate::{PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||||
use crate::{MemoryBitVec, MemoryIntVec};
|
|
||||||
|
|
||||||
// ── helpers ───────────────────────────────────────────────────────────────────
|
// ── helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
@@ -114,42 +113,52 @@ fn int_partial_group_any() {
|
|||||||
#[test]
|
#[test]
|
||||||
fn mask_with_zeros_selected_slots() {
|
fn mask_with_zeros_selected_slots() {
|
||||||
// count vec [10, 20, 30, 40], mask [T, F, T, F] → [10, 0, 30, 0]
|
// count vec [10, 20, 30, 40], mask [T, F, T, F] → [10, 0, 30, 0]
|
||||||
let mut v = MemoryIntVec::new(4);
|
let dir = tempdir().unwrap();
|
||||||
|
let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap();
|
||||||
v.set(0, 10); v.set(1, 20); v.set(2, 30); v.set(3, 40);
|
v.set(0, 10); v.set(1, 20); v.set(2, 30); v.set(3, 40);
|
||||||
let mut mask = MemoryBitVec::new(4);
|
let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap();
|
||||||
mask.set(0, true); mask.set(2, true);
|
mask.set(0, true); mask.set(2, true);
|
||||||
v.mask_with(&mask);
|
v.mask_with(mask.view());
|
||||||
assert_eq!(v.get(0), 10);
|
v.close().unwrap();
|
||||||
assert_eq!(v.get(1), 0);
|
let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap();
|
||||||
assert_eq!(v.get(2), 30);
|
assert_eq!(r.get(0), 10);
|
||||||
assert_eq!(v.get(3), 0);
|
assert_eq!(r.get(1), 0);
|
||||||
|
assert_eq!(r.get(2), 30);
|
||||||
|
assert_eq!(r.get(3), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn mask_with_overflow_slot_zeroed() {
|
fn mask_with_overflow_slot_zeroed() {
|
||||||
// overflow slot (value 500) masked out → removed from overflow, primary=0
|
// overflow slot (value 500) masked out → removed from overflow, primary=0
|
||||||
let mut v = MemoryIntVec::new(3);
|
let dir = tempdir().unwrap();
|
||||||
|
let mut v = PersistentCompactIntVecBuilder::new(3, &dir.path().join("v.pciv")).unwrap();
|
||||||
v.set(0, 10); v.set(1, 500); v.set(2, 5);
|
v.set(0, 10); v.set(1, 500); v.set(2, 5);
|
||||||
let mut mask = MemoryBitVec::new(3);
|
let mut mask = PersistentBitVecBuilder::new(3, &dir.path().join("m.pbiv")).unwrap();
|
||||||
mask.set(0, true); mask.set(2, true); // slot 1 masked out
|
mask.set(0, true); mask.set(2, true); // slot 1 masked out
|
||||||
v.mask_with(&mask);
|
v.mask_with(mask.view());
|
||||||
assert_eq!(v.get(0), 10);
|
v.close().unwrap();
|
||||||
assert_eq!(v.get(1), 0);
|
let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap();
|
||||||
assert_eq!(v.get(2), 5);
|
assert_eq!(r.get(0), 10);
|
||||||
let ov: Vec<_> = v.overflow_entries().collect();
|
assert_eq!(r.get(1), 0);
|
||||||
|
assert_eq!(r.get(2), 5);
|
||||||
|
let ov: Vec<_> = r.view().overflow_entries().collect();
|
||||||
assert!(ov.is_empty(), "overflow entry for masked-out slot should be gone");
|
assert!(ov.is_empty(), "overflow entry for masked-out slot should be gone");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn mask_with_all_ones_is_noop() {
|
fn mask_with_all_ones_is_noop() {
|
||||||
let mut v = MemoryIntVec::new(4);
|
let dir = tempdir().unwrap();
|
||||||
|
let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap();
|
||||||
v.set(0, 300); v.set(1, 1); v.set(2, 0); v.set(3, 42);
|
v.set(0, 300); v.set(1, 1); v.set(2, 0); v.set(3, 42);
|
||||||
let mask = MemoryBitVec::ones(4);
|
let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap();
|
||||||
v.mask_with(&mask);
|
mask.not(); // all bits → 1
|
||||||
assert_eq!(v.get(0), 300);
|
v.mask_with(mask.view());
|
||||||
assert_eq!(v.get(1), 1);
|
v.close().unwrap();
|
||||||
assert_eq!(v.get(2), 0);
|
let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap();
|
||||||
assert_eq!(v.get(3), 42);
|
assert_eq!(r.get(0), 300);
|
||||||
|
assert_eq!(r.get(1), 1);
|
||||||
|
assert_eq!(r.get(2), 0);
|
||||||
|
assert_eq!(r.get(3), 42);
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── BitMatrix: partial_group_presence_count ───────────────────────────────────
|
// ── BitMatrix: partial_group_presence_count ───────────────────────────────────
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||||
use crate::traits::{CountPartials, IntSlice};
|
use crate::traits::CountPartials;
|
||||||
|
|
||||||
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||||
let n = cols.first().map_or(0, |c| c.len());
|
let n = cols.first().map_or(0, |c| c.len());
|
||||||
@@ -290,7 +290,7 @@ fn col_view_packed_matches_columnar() {
|
|||||||
}
|
}
|
||||||
assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum");
|
assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum");
|
||||||
let mut ov_view: Vec<(usize, u32)> = col_view.overflow_entries().collect();
|
let mut ov_view: Vec<(usize, u32)> = col_view.overflow_entries().collect();
|
||||||
let mut ov_ref: Vec<(usize, u32)> = col_ref.overflow_entries().collect();
|
let mut ov_ref: Vec<(usize, u32)> = col_ref.view().overflow_entries().collect();
|
||||||
ov_view.sort_unstable_by_key(|&(s, _)| s);
|
ov_view.sort_unstable_by_key(|&(s, _)| s);
|
||||||
ov_ref.sort_unstable_by_key(|&(s, _)| s);
|
ov_ref.sort_unstable_by_key(|&(s, _)| s);
|
||||||
assert_eq!(ov_view, ov_ref, "col={c} overflow_entries");
|
assert_eq!(ov_view, ov_ref, "col={c} overflow_entries");
|
||||||
|
|||||||
@@ -1,484 +0,0 @@
|
|||||||
use tempfile::tempdir;
|
|
||||||
|
|
||||||
use crate::traits::{BitSlice, BitSliceMut, BitToInt, IntSlice, IntSliceMut, IntToBit};
|
|
||||||
use crate::{MemoryBitVec, MemoryIntVec, PersistentBitVec, PersistentBitVecBuilder};
|
|
||||||
|
|
||||||
// ── MemoryBitVec ──────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_new_all_zero() {
|
|
||||||
let v = MemoryBitVec::new(10);
|
|
||||||
assert_eq!(v.len(), 10);
|
|
||||||
assert!(!(0..10).any(|s| v.get(s)));
|
|
||||||
assert_eq!(v.count_ones(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_ones_all_set() {
|
|
||||||
let v = MemoryBitVec::ones(10);
|
|
||||||
assert!((0..10).all(|s| v.get(s)));
|
|
||||||
assert_eq!(v.count_ones(), 10);
|
|
||||||
assert_eq!(v.count_zeros(), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_ones_no_padding_leak() {
|
|
||||||
// 5 bits: padding bits in last word must stay 0
|
|
||||||
let v = MemoryBitVec::ones(5);
|
|
||||||
assert_eq!(v.words()[0], 0b11111);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_set_get_roundtrip() {
|
|
||||||
let mut v = MemoryBitVec::new(64);
|
|
||||||
v.set(0, true);
|
|
||||||
v.set(63, true);
|
|
||||||
assert!(v.get(0));
|
|
||||||
assert!(!v.get(1));
|
|
||||||
assert!(v.get(63));
|
|
||||||
assert_eq!(v.count_ones(), 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_and() {
|
|
||||||
let mut a = MemoryBitVec::new(4);
|
|
||||||
a.set(0, true); a.set(1, true);
|
|
||||||
let mut b = MemoryBitVec::new(4);
|
|
||||||
b.set(0, true); b.set(2, true);
|
|
||||||
a.and(&b);
|
|
||||||
assert!(a.get(0)); assert!(!a.get(1)); assert!(!a.get(2));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_or() {
|
|
||||||
let mut a = MemoryBitVec::new(4);
|
|
||||||
a.set(0, true); a.set(1, true);
|
|
||||||
let mut b = MemoryBitVec::new(4);
|
|
||||||
b.set(0, true); b.set(2, true);
|
|
||||||
a.or(&b);
|
|
||||||
assert!(a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_xor() {
|
|
||||||
let mut a = MemoryBitVec::new(4);
|
|
||||||
a.set(0, true); a.set(1, true);
|
|
||||||
let mut b = MemoryBitVec::new(4);
|
|
||||||
b.set(0, true); b.set(2, true);
|
|
||||||
a.xor(&b);
|
|
||||||
assert!(!a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_not() {
|
|
||||||
let mut a = MemoryBitVec::new(4);
|
|
||||||
a.set(0, true); a.set(2, true);
|
|
||||||
a.not();
|
|
||||||
assert!(!a.get(0)); assert!(a.get(1)); assert!(!a.get(2)); assert!(a.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_not_no_padding_leak() {
|
|
||||||
let mut v = MemoryBitVec::new(5);
|
|
||||||
v.not();
|
|
||||||
assert_eq!(v.count_ones(), 5);
|
|
||||||
assert_eq!(v.words()[0], 0b11111);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_ops_chaining() {
|
|
||||||
let mut a = MemoryBitVec::ones(8);
|
|
||||||
let b = MemoryBitVec::new(8); // all zeros
|
|
||||||
a.and(&b).or(&b).not();
|
|
||||||
assert_eq!(a.count_ones(), 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_std_ops_owned() {
|
|
||||||
let mut a = MemoryBitVec::new(4);
|
|
||||||
a.set(0, true); a.set(1, true);
|
|
||||||
let mut b = MemoryBitVec::new(4);
|
|
||||||
b.set(1, true); b.set(2, true);
|
|
||||||
let c = a & &b;
|
|
||||||
assert!(!c.get(0)); assert!(c.get(1)); assert!(!c.get(2));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_std_ops_assign() {
|
|
||||||
let mut a = MemoryBitVec::new(4);
|
|
||||||
a.set(0, true); a.set(1, true);
|
|
||||||
let mut b = MemoryBitVec::new(4);
|
|
||||||
b.set(1, true); b.set(2, true);
|
|
||||||
a &= &b;
|
|
||||||
assert!(!a.get(0)); assert!(a.get(1));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_from_persistent() {
|
|
||||||
let dir = tempdir().unwrap();
|
|
||||||
let path = dir.path().join("v.pbiv");
|
|
||||||
let mut builder = PersistentBitVecBuilder::new(4, &path).unwrap();
|
|
||||||
builder.set(1, true); builder.set(3, true);
|
|
||||||
builder.close().unwrap();
|
|
||||||
let pv = PersistentBitVec::open(&path).unwrap();
|
|
||||||
let mv = MemoryBitVec::from(&pv);
|
|
||||||
assert!(!mv.get(0)); assert!(mv.get(1)); assert!(!mv.get(2)); assert!(mv.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn mbv_persist_roundtrip() {
|
|
||||||
let dir = tempdir().unwrap();
|
|
||||||
let path = dir.path().join("out.pbiv");
|
|
||||||
let mut v = MemoryBitVec::new(8);
|
|
||||||
v.set(2, true); v.set(5, true);
|
|
||||||
let builder = v.persist(&path).unwrap();
|
|
||||||
builder.close().unwrap();
|
|
||||||
let pv = PersistentBitVec::open(&path).unwrap();
|
|
||||||
assert!(pv.get(2)); assert!(pv.get(5));
|
|
||||||
assert_eq!(pv.count_ones(), 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── MemoryIntVec ──────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_new_all_zero() {
|
|
||||||
let v = MemoryIntVec::new(10);
|
|
||||||
assert_eq!(v.len(), 10);
|
|
||||||
assert!((0..10).all(|s| v.get(s) == 0));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_set_get_roundtrip() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.set(0, 42); v.set(3, 200);
|
|
||||||
assert_eq!(v.get(0), 42);
|
|
||||||
assert_eq!(v.get(1), 0);
|
|
||||||
assert_eq!(v.get(3), 200);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_overflow_roundtrip() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.set(1, 1000);
|
|
||||||
assert_eq!(v.get(1), 1000);
|
|
||||||
assert_eq!(v.get(0), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_inc_dec() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.inc(2); v.inc(2); v.inc(2);
|
|
||||||
assert_eq!(v.get(2), 3);
|
|
||||||
v.dec(2);
|
|
||||||
assert_eq!(v.get(2), 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_dec_saturates_at_zero() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.dec(0);
|
|
||||||
assert_eq!(v.get(0), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_add_at() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.add_at(1, 100); v.add_at(1, 200);
|
|
||||||
assert_eq!(v.get(1), 300);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_min_max() {
|
|
||||||
let mut a = MemoryIntVec::new(4);
|
|
||||||
a.set(0, 5); a.set(1, 2); a.set(2, 8);
|
|
||||||
let mut b = MemoryIntVec::new(4);
|
|
||||||
b.set(0, 3); b.set(1, 7); b.set(2, 8);
|
|
||||||
let mut c = MemoryIntVec::from(&a);
|
|
||||||
IntSliceMut::min(&mut c, &b);
|
|
||||||
assert_eq!(c.get(0), 3); assert_eq!(c.get(1), 2); assert_eq!(c.get(2), 8);
|
|
||||||
let mut d = MemoryIntVec::from(&a);
|
|
||||||
IntSliceMut::max(&mut d, &b);
|
|
||||||
assert_eq!(d.get(0), 5); assert_eq!(d.get(1), 7); assert_eq!(d.get(2), 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_add_diff() {
|
|
||||||
let mut a = MemoryIntVec::new(3);
|
|
||||||
a.set(0, 10); a.set(1, 5);
|
|
||||||
let mut b = MemoryIntVec::new(3);
|
|
||||||
b.set(0, 3); b.set(1, 8);
|
|
||||||
let mut c = MemoryIntVec::from(&a);
|
|
||||||
c.add(&b);
|
|
||||||
assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
|
|
||||||
let mut d = MemoryIntVec::from(&a);
|
|
||||||
d.diff(&b);
|
|
||||||
assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); // saturating sub
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_std_ops() {
|
|
||||||
let mut a = MemoryIntVec::new(3);
|
|
||||||
a.set(0, 10); a.set(1, 5);
|
|
||||||
let mut b = MemoryIntVec::new(3);
|
|
||||||
b.set(0, 3); b.set(1, 8);
|
|
||||||
let c = &a + &b;
|
|
||||||
assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
|
|
||||||
let d = &a - &b;
|
|
||||||
assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_from_persistent() {
|
|
||||||
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
|
||||||
let dir = tempdir().unwrap();
|
|
||||||
let path = dir.path().join("v.pciv");
|
|
||||||
let mut b = PersistentCompactIntVecBuilder::new(4, &path).unwrap();
|
|
||||||
b.set(1, 42); b.set(3, 1000);
|
|
||||||
b.close().unwrap();
|
|
||||||
let pv = PersistentCompactIntVec::open(&path).unwrap();
|
|
||||||
let mv = MemoryIntVec::from(&pv);
|
|
||||||
assert_eq!(mv.get(0), 0); assert_eq!(mv.get(1), 42); assert_eq!(mv.get(3), 1000);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Cross-type conversions ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn to_bitvec_threshold() {
|
|
||||||
let mut v = MemoryIntVec::new(5);
|
|
||||||
v.set(0, 0); v.set(1, 1); v.set(2, 5); v.set(3, 10); v.set(4, 3);
|
|
||||||
let bv = v.to_bitvec(4); // > 4: slots 2 (5) and 3 (10) pass
|
|
||||||
assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
|
|
||||||
assert!(bv.get(3)); assert!(!bv.get(4));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn to_presence() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.set(1, 1); v.set(3, 100);
|
|
||||||
let bv = v.to_presence();
|
|
||||||
assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn to_intvec_roundtrip() {
|
|
||||||
let mut bv = MemoryBitVec::new(8);
|
|
||||||
bv.set(0, true); bv.set(3, true); bv.set(7, true);
|
|
||||||
let iv = bv.to_intvec();
|
|
||||||
assert_eq!(iv.get(0), 1); assert_eq!(iv.get(1), 0);
|
|
||||||
assert_eq!(iv.get(3), 1); assert_eq!(iv.get(7), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn to_intvec_word_boundary() {
|
|
||||||
// 65 bits: spans two words
|
|
||||||
let mut bv = MemoryBitVec::new(65);
|
|
||||||
bv.set(63, true); bv.set(64, true);
|
|
||||||
let iv = bv.to_intvec();
|
|
||||||
assert_eq!(iv.get(63), 1); assert_eq!(iv.get(64), 1); assert_eq!(iv.get(62), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn count_bits_accumulates() {
|
|
||||||
let mut count = MemoryIntVec::new(8);
|
|
||||||
let mut b1 = MemoryBitVec::new(8);
|
|
||||||
b1.set(0, true); b1.set(2, true);
|
|
||||||
let mut b2 = MemoryBitVec::new(8);
|
|
||||||
b2.set(0, true); b2.set(3, true);
|
|
||||||
let mut b3 = MemoryBitVec::new(8);
|
|
||||||
b3.set(2, true); b3.set(3, true);
|
|
||||||
count.count_bits(&b1).count_bits(&b2).count_bits(&b3);
|
|
||||||
assert_eq!(count.get(0), 2);
|
|
||||||
assert_eq!(count.get(2), 2);
|
|
||||||
assert_eq!(count.get(3), 2);
|
|
||||||
assert_eq!(count.get(1), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn count_bits_skips_zero_words() {
|
|
||||||
// Entire first word is zero — should not touch those slots
|
|
||||||
let mut count = MemoryIntVec::new(128);
|
|
||||||
let mut bv = MemoryBitVec::new(128);
|
|
||||||
bv.set(64, true); bv.set(127, true);
|
|
||||||
count.count_bits(&bv);
|
|
||||||
assert_eq!(count.get(0), 0);
|
|
||||||
assert_eq!(count.get(64), 1);
|
|
||||||
assert_eq!(count.get(127), 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── min / max / add / diff — overflow edge cases ──────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_min_overflow_edges() {
|
|
||||||
// [300, 50, 400, 300] min [50, 300, 500, 200]
|
|
||||||
// slot 0: self=overflow(300), other=primary(50) → 50 (overflow removed)
|
|
||||||
// slot 1: self=primary(50), other=overflow(300) → 50 (no overflow created)
|
|
||||||
// slot 2: self=overflow(400), other=overflow(500) → 400 (overflow updated)
|
|
||||||
// slot 3: self=overflow(300), other=primary(200) → 200 (overflow removed, 200 < 255)
|
|
||||||
let mut a = MemoryIntVec::new(4);
|
|
||||||
a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 300);
|
|
||||||
let mut b = MemoryIntVec::new(4);
|
|
||||||
b.set(0, 50); b.set(1, 300); b.set(2, 500); b.set(3, 200);
|
|
||||||
IntSliceMut::min(&mut a, &b);
|
|
||||||
assert_eq!(a.get(0), 50);
|
|
||||||
assert_eq!(a.get(1), 50);
|
|
||||||
assert_eq!(a.get(2), 400);
|
|
||||||
assert_eq!(a.get(3), 200);
|
|
||||||
// Only slot 2 should still have an overflow entry.
|
|
||||||
let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
|
|
||||||
assert_eq!(ov.len(), 1);
|
|
||||||
assert_eq!(ov[&2], 400);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_max_overflow_edges() {
|
|
||||||
// [50, 300, 100, 400] max [300, 50, 500, 200]
|
|
||||||
// slot 0: self=primary(50), other=overflow(300) → 300 (overflow created)
|
|
||||||
// slot 1: self=overflow(300), other=primary(50) → 300 (overflow unchanged)
|
|
||||||
// slot 2: self=primary(100), other=overflow(500) → 500 (overflow created)
|
|
||||||
// slot 3: self=overflow(400), other=primary(200)  → 400 (overflow unchanged)
|
|
||||||
// Note: 200 < 255, so other slot 3 is stored in primary, NOT in overflow; max(400, 200) = 400.
|
|
||||||
let mut a = MemoryIntVec::new(4);
|
|
||||||
a.set(0, 50); a.set(1, 300); a.set(2, 100); a.set(3, 400);
|
|
||||||
let mut b = MemoryIntVec::new(4);
|
|
||||||
b.set(0, 300); b.set(1, 50); b.set(2, 500); b.set(3, 200);
|
|
||||||
IntSliceMut::max(&mut a, &b);
|
|
||||||
assert_eq!(a.get(0), 300);
|
|
||||||
assert_eq!(a.get(1), 300);
|
|
||||||
assert_eq!(a.get(2), 500);
|
|
||||||
assert_eq!(a.get(3), 400);
|
|
||||||
let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
|
|
||||||
assert_eq!(ov.len(), 4); // all four results >= 255
|
|
||||||
assert_eq!(ov[&0], 300);
|
|
||||||
assert_eq!(ov[&1], 300);
|
|
||||||
assert_eq!(ov[&2], 500);
|
|
||||||
assert_eq!(ov[&3], 400);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_add_overflow_edges() {
|
|
||||||
// [300, 50, 400, 200] + [50, 300, 200, 200]
|
|
||||||
// slot 0: self=overflow(300), other=primary(50) → 350 (overflow updated)
|
|
||||||
// slot 1: self=primary(50), other=overflow(300) → 350 (overflow created from primary)
|
|
||||||
// slot 2: self=overflow(400), other=primary(200)  → 600 (overflow updated)
|
|
||||||
// Note: 200 < 255, so other slot 2 is primary(200), not overflow; 400+200=600.
|
|
||||||
// slot 3: self=primary(200), other=primary(200) → 400 (overflow created, 400 >= 255)
|
|
||||||
let mut a = MemoryIntVec::new(4);
|
|
||||||
a.set(0, 300); a.set(1, 50); a.set(2, 400); a.set(3, 200);
|
|
||||||
let mut b = MemoryIntVec::new(4);
|
|
||||||
b.set(0, 50); b.set(1, 300); b.set(2, 200); b.set(3, 200);
|
|
||||||
a.add(&b);
|
|
||||||
assert_eq!(a.get(0), 350);
|
|
||||||
assert_eq!(a.get(1), 350);
|
|
||||||
assert_eq!(a.get(2), 600);
|
|
||||||
assert_eq!(a.get(3), 400);
|
|
||||||
let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
|
|
||||||
assert_eq!(ov.len(), 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_add_both_overflow() {
|
|
||||||
// [300] + [400] = [700]
|
|
||||||
let mut a = MemoryIntVec::new(1);
|
|
||||||
a.set(0, 300);
|
|
||||||
let mut b = MemoryIntVec::new(1);
|
|
||||||
b.set(0, 400);
|
|
||||||
a.add(&b);
|
|
||||||
assert_eq!(a.get(0), 700);
|
|
||||||
let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
|
|
||||||
assert_eq!(ov[&0], 700);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn miv_diff_overflow_edges() {
|
|
||||||
// [300, 400, 400, 50] - [100, 50, 350, 300]
|
|
||||||
// slot 0: self=overflow(300), other=primary(100) → 200 (overflow removed, 200 < 255)
|
|
||||||
// slot 1: self=overflow(400), other=primary(50) → 350 (overflow updated, 350 >= 255)
|
|
||||||
// slot 2: self=overflow(400), other=overflow(350) → 50 (overflow removed, 50 < 255)
|
|
||||||
// slot 3: self=primary(50), other=overflow(300) → 0 (saturating, stays primary)
|
|
||||||
let mut a = MemoryIntVec::new(4);
|
|
||||||
a.set(0, 300); a.set(1, 400); a.set(2, 400); a.set(3, 50);
|
|
||||||
let mut b = MemoryIntVec::new(4);
|
|
||||||
b.set(0, 100); b.set(1, 50); b.set(2, 350); b.set(3, 300);
|
|
||||||
a.diff(&b);
|
|
||||||
assert_eq!(a.get(0), 200);
|
|
||||||
assert_eq!(a.get(1), 350);
|
|
||||||
assert_eq!(a.get(2), 50);
|
|
||||||
assert_eq!(a.get(3), 0);
|
|
||||||
let ov: std::collections::HashMap<usize, u32> = a.overflow_entries().collect();
|
|
||||||
assert_eq!(ov.len(), 1); // only slot 1 remains overflow
|
|
||||||
assert_eq!(ov[&1], 350);
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── Comparison operators ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn cmp_gt() {
|
|
||||||
let mut v = MemoryIntVec::new(5);
|
|
||||||
v.set(0, 0); v.set(1, 3); v.set(2, 5); v.set(3, 3); v.set(4, 10);
|
|
||||||
let bv = v.gt(3);
|
|
||||||
assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
|
|
||||||
assert!(!bv.get(3)); assert!(bv.get(4));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn cmp_geq() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 1);
|
|
||||||
let bv = v.geq(3);
|
|
||||||
assert!(!bv.get(0)); assert!(bv.get(1)); assert!(bv.get(2)); assert!(!bv.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn cmp_lt() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 0);
|
|
||||||
let bv = v.lt(3);
|
|
||||||
assert!(bv.get(0)); assert!(!bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn cmp_leq() {
|
|
||||||
let mut v = MemoryIntVec::new(4);
|
|
||||||
v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 3);
|
|
||||||
let bv = v.leq(3);
|
|
||||||
assert!(bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn cmp_scalar_with_overflow() {
|
|
||||||
// Slots: [10, 1000, 50, 500, 0]
|
|
||||||
// geq(100): slots 1 (1000) and 3 (500) → both overflow, must qualify
|
|
||||||
// lt(500): slots 0 (10), 2 (50), 4 (0) → primary; slot 1 (1000) → no; slot 3 (500) → no
|
|
||||||
// geq(2000): even the largest value (slot 1, 1000) is below the threshold → no slot qualifies
|
|
||||||
let mut v = MemoryIntVec::new(5);
|
|
||||||
v.set(0, 10); v.set(1, 1000); v.set(2, 50); v.set(3, 500); v.set(4, 0);
|
|
||||||
|
|
||||||
let bv = v.geq(100);
|
|
||||||
assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2));
|
|
||||||
assert!(bv.get(3)); assert!(!bv.get(4));
|
|
||||||
|
|
||||||
let bv = v.lt(500);
|
|
||||||
assert!(bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
|
|
||||||
assert!(!bv.get(3)); assert!(bv.get(4));
|
|
||||||
|
|
||||||
let bv = v.geq(2000);
|
|
||||||
assert!(!(0..5).any(|s| bv.get(s)));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn filter_pattern() {
|
|
||||||
// Typical filter: ingroup >= min_count AND outgroup <= max_outgroup
|
|
||||||
let mut ingroup = MemoryIntVec::new(6);
|
|
||||||
let mut outgroup = MemoryIntVec::new(6);
|
|
||||||
// slot 2: ingroup=3, outgroup=0 → keep
|
|
||||||
// slot 4: ingroup=2, outgroup=1 → drop (outgroup > 0)
|
|
||||||
// slot 5: ingroup=1, outgroup=0 → drop (ingroup < 2)
|
|
||||||
ingroup.set(2, 3); ingroup.set(4, 2); ingroup.set(5, 1);
|
|
||||||
outgroup.set(4, 1);
|
|
||||||
let out_mask = outgroup.leq(0);
|
|
||||||
let mut in_mask = ingroup.geq(2);
|
|
||||||
let keep = in_mask.and(&out_mask);
|
|
||||||
assert!(!keep.get(0)); assert!(!keep.get(1));
|
|
||||||
assert!(keep.get(2));
|
|
||||||
assert!(!keep.get(4)); assert!(!keep.get(5));
|
|
||||||
}
|
|
||||||
@@ -2,12 +2,9 @@ mod bitmatrix;
|
|||||||
mod bitvec;
|
mod bitvec;
|
||||||
mod colgroup;
|
mod colgroup;
|
||||||
mod intmatrix;
|
mod intmatrix;
|
||||||
mod memoryvec;
|
|
||||||
|
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
use crate::traits::IntSliceMut;
|
|
||||||
|
|
||||||
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
|
||||||
|
|
||||||
fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec<u32> {
|
fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec<u32> {
|
||||||
@@ -173,7 +170,7 @@ fn combine_min() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.min(&rb);
|
b.min(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 100, 0, 800]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 100, 0, 800]);
|
||||||
@@ -186,7 +183,7 @@ fn combine_max() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.max(&rb);
|
b.max(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![20, 300, 500, 1000]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![20, 300, 500, 1000]);
|
||||||
@@ -199,7 +196,7 @@ fn combine_add() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.add(&rb);
|
b.add(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![30, 300, 5, 101]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![30, 300, 5, 101]);
|
||||||
@@ -224,7 +221,7 @@ fn combine_diff() {
|
|||||||
let dir = tempdir().unwrap();
|
let dir = tempdir().unwrap();
|
||||||
let path = dir.path().join("out.pciv");
|
let path = dir.path().join("out.pciv");
|
||||||
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
|
||||||
b.diff(&rb);
|
b.diff(rb.view());
|
||||||
b.close().unwrap();
|
b.close().unwrap();
|
||||||
let r = PersistentCompactIntVec::open(&path).unwrap();
|
let r = PersistentCompactIntVec::open(&path).unwrap();
|
||||||
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
|
assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
|
||||||
|
|||||||
@@ -1,353 +1,5 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
use ndarray::{Array1, Array2};
|
use ndarray::{Array1, Array2};
|
||||||
|
|
||||||
// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/// Read-only view over the u64 word array of a bit vector.
|
|
||||||
///
|
|
||||||
/// Bit `i` is in `words()[i >> 6]` at position `i & 63`.
|
|
||||||
/// Padding bits in the last word are zero.
|
|
||||||
pub trait BitSlice {
|
|
||||||
fn len(&self) -> usize;
|
|
||||||
fn words(&self) -> &[u64];
|
|
||||||
fn is_empty(&self) -> bool { self.len() == 0 }
|
|
||||||
fn get(&self, slot: usize) -> bool {
|
|
||||||
(self.words()[slot >> 6] >> (slot & 63)) & 1 != 0
|
|
||||||
}
|
|
||||||
fn count_ones(&self) -> u64 {
|
|
||||||
self.words().iter().map(|w| w.count_ones() as u64).sum()
|
|
||||||
}
|
|
||||||
fn count_zeros(&self) -> u64 { self.len() as u64 - self.count_ones() }
|
|
||||||
fn partial_jaccard_dist<S: BitSlice>(&self, other: &S) -> (u64, u64) {
|
|
||||||
assert_eq!(self.len(), other.len(), "length mismatch");
|
|
||||||
self.words().iter().zip(other.words())
|
|
||||||
.fold((0u64, 0u64), |(i, u), (&a, &b)| {
|
|
||||||
(i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64)
|
|
||||||
})
|
|
||||||
}
|
|
||||||
fn jaccard_dist<S: BitSlice>(&self, other: &S) -> f64 {
|
|
||||||
let (inter, union) = self.partial_jaccard_dist(other);
|
|
||||||
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
|
||||||
}
|
|
||||||
fn hamming_dist<S: BitSlice>(&self, other: &S) -> u64 {
|
|
||||||
assert_eq!(self.len(), other.len(), "length mismatch");
|
|
||||||
self.words().iter().zip(other.words())
|
|
||||||
.map(|(&a, &b)| (a ^ b).count_ones() as u64)
|
|
||||||
.sum()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Mutable view over a bit-vector word array; default methods maintain the
|
|
||||||
/// zero-padding invariant on the last word.
|
|
||||||
pub trait BitSliceMut: BitSlice {
|
|
||||||
fn words_mut(&mut self) -> &mut [u64];
|
|
||||||
|
|
||||||
fn set(&mut self, slot: usize, value: bool) {
|
|
||||||
let bit = 1u64 << (slot & 63);
|
|
||||||
if value { self.words_mut()[slot >> 6] |= bit; } else { self.words_mut()[slot >> 6] &= !bit; }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn copy_from<S: BitSlice>(&mut self, src: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), src.len(), "BitSlice length mismatch");
|
|
||||||
self.words_mut().copy_from_slice(src.words());
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn and<S: BitSlice>(&mut self, other: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
|
|
||||||
for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w &= o; }
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn or<S: BitSlice>(&mut self, other: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
|
|
||||||
for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w |= o; }
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn xor<S: BitSlice>(&mut self, other: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
|
|
||||||
for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w ^= o; }
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn not(&mut self) -> &mut Self {
|
|
||||||
let rem = self.len() % 64;
|
|
||||||
let words = self.words_mut();
|
|
||||||
for w in words.iter_mut() { *w ^= u64::MAX; }
|
|
||||||
if rem != 0 {
|
|
||||||
if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; }
|
|
||||||
}
|
|
||||||
self
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/// Read-only access to a compact integer vector (values encoded as u32).
|
|
||||||
pub trait IntSlice {
|
|
||||||
fn len(&self) -> usize;
|
|
||||||
fn get(&self, slot: usize) -> u32;
|
|
||||||
/// Raw primary byte slice (sentinel 255 marks overflow slots).
|
|
||||||
fn primary_bytes(&self) -> &[u8];
|
|
||||||
/// Iterator over `(slot, true_value)` pairs for all overflow entries (value >= 255).
|
|
||||||
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_;
|
|
||||||
fn is_empty(&self) -> bool { self.len() == 0 }
|
|
||||||
fn iter(&self) -> impl Iterator<Item = u32> + '_ { (0..self.len()).map(|i| self.get(i)) }
|
|
||||||
fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() }
|
|
||||||
fn count_nonzero(&self) -> u64 { self.iter().filter(|v| *v > 0).count() as u64 }
|
|
||||||
|
|
||||||
fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v < threshold) }
|
|
||||||
fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) }
|
|
||||||
fn gt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v > threshold) }
|
|
||||||
fn geq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v >= threshold) }
|
|
||||||
|
|
||||||
fn cmp_scalar(&self, pred: impl Fn(u32) -> bool) -> MemoryBitVec {
|
|
||||||
let n = self.len();
|
|
||||||
let mut words = vec![0u64; n.div_ceil(64)];
|
|
||||||
let primary = self.primary_bytes();
|
|
||||||
// Pass 1: byte scan — no HashMap access, vectorisable for simple predicates.
|
|
||||||
// Overflow slots (b == 255) are left as 0 and fixed in pass 2.
|
|
||||||
for s in 0..n {
|
|
||||||
let b = primary[s];
|
|
||||||
if b < 255 && pred(b as u32) {
|
|
||||||
words[s >> 6] |= 1u64 << (s & 63);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Pass 2: fix up overflow slots — O(k), negligible.
|
|
||||||
for (s, val) in self.overflow_entries() {
|
|
||||||
if pred(val) { words[s >> 6] |= 1u64 << (s & 63); }
|
|
||||||
}
|
|
||||||
MemoryBitVec::from_words(words, n)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Mutable access; default methods use only `get` / `set` and maintain the
|
|
||||||
/// compact encoding invariants on the implementor's side.
|
|
||||||
pub trait IntSliceMut: IntSlice {
|
|
||||||
fn set(&mut self, slot: usize, value: u32);
|
|
||||||
fn primary_bytes_mut(&mut self) -> &mut [u8];
|
|
||||||
fn clear_overflow(&mut self);
|
|
||||||
|
|
||||||
fn inc(&mut self, slot: usize) -> &mut Self {
|
|
||||||
let v = self.get(slot);
|
|
||||||
self.set(slot, v.saturating_add(1));
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn dec(&mut self, slot: usize) -> &mut Self {
|
|
||||||
let v = self.get(slot);
|
|
||||||
self.set(slot, v.saturating_sub(1));
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add_at(&mut self, slot: usize, delta: u32) -> &mut Self {
|
|
||||||
let v = self.get(slot);
|
|
||||||
self.set(slot, v.saturating_add(delta));
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn copy_from<S: IntSlice>(&mut self, src: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), src.len(), "IntSlice length mismatch");
|
|
||||||
self.primary_bytes_mut().copy_from_slice(src.primary_bytes());
|
|
||||||
self.clear_overflow();
|
|
||||||
for (slot, val) in src.overflow_entries() { self.set(slot, val); }
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn min<S: IntSlice>(&mut self, other: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
|
|
||||||
// Snapshot both overflow sets (O(k), tiny) before mutating self.
|
|
||||||
// 255 = +∞ on u8, so byte-level min is correct in all cases except
|
|
||||||
// both-overflow: only those slots need a fixup pass.
|
|
||||||
let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
|
|
||||||
let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
|
|
||||||
self.clear_overflow();
|
|
||||||
// Pass 1 — SIMD-vectorizable byte min over the full primary array.
|
|
||||||
for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
|
|
||||||
if b < *a { *a = b; }
|
|
||||||
}
|
|
||||||
// Pass 2 — fixup slots where BOTH sides were overflow (primary = 255 after pass 1,
|
|
||||||
// but the overflow value may have changed). Slots where only self was overflow are
|
|
||||||
// already correct: pass 1 wrote other.primary[slot] < 255 and clear_overflow removed
|
|
||||||
// the stale entry.
|
|
||||||
for (slot, self_val) in self_ov {
|
|
||||||
if let Some(&other_val) = other_ov.get(&slot) {
|
|
||||||
self.set(slot, self_val.min(other_val));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn max<S: IntSlice>(&mut self, other: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
|
|
||||||
// Pre-pass — process other's overflow entries BEFORE the byte pass.
|
|
||||||
// After the byte pass, self.primary[slot] = 255 for all slots in other_ov,
|
|
||||||
// making it impossible to recover the original self value; we need it now.
|
|
||||||
for (slot, other_val) in other.overflow_entries() {
|
|
||||||
let self_val = self.get(slot);
|
|
||||||
self.set(slot, self_val.max(other_val));
|
|
||||||
}
|
|
||||||
// Pass 1 — SIMD-vectorizable byte max over the full primary array.
|
|
||||||
// 255 = +∞ on u8 → max(a, 255) = 255 is the correct sentinel for all
|
|
||||||
// overflow slots, whether handled by the pre-pass or already in self.
|
|
||||||
for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
|
|
||||||
if b > *a { *a = b; }
|
|
||||||
}
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn add<S: IntSlice>(&mut self, other: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
|
|
||||||
let n = self.len();
|
|
||||||
for s in 0..n {
|
|
||||||
// Read both primary bytes first — u8 is Copy, borrows released immediately.
|
|
||||||
let sb = self.primary_bytes()[s];
|
|
||||||
let ob = other.primary_bytes()[s];
|
|
||||||
if sb < 255 && ob < 255 {
|
|
||||||
// Hot path: no overflow lookup, no HashMap write in the common case.
|
|
||||||
let sum = sb as u32 + ob as u32;
|
|
||||||
if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; }
|
|
||||||
else { self.set(s, sum); }
|
|
||||||
} else {
|
|
||||||
// At least one side is in overflow — get() is unavoidable.
|
|
||||||
let self_val = self.get(s);
|
|
||||||
let other_val = other.get(s);
|
|
||||||
self.set(s, self_val + other_val);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
fn diff<S: IntSlice>(&mut self, other: &S) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
|
|
||||||
let n = self.len();
|
|
||||||
for s in 0..n {
|
|
||||||
let sb = self.primary_bytes()[s];
|
|
||||||
let ob = other.primary_bytes()[s];
|
|
||||||
if sb < 255 {
|
|
||||||
// Result is always < 255 — no overflow created or consulted.
|
|
||||||
// ob == 255 means b ≥ 255 > a, so saturating result = 0.
|
|
||||||
self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 };
|
|
||||||
} else {
|
|
||||||
// sb == 255: self has overflow — get() unavoidable.
|
|
||||||
// other.get() only needed when ob == 255 too (both-overflow case).
|
|
||||||
let self_val = self.get(s);
|
|
||||||
let other_val = if ob < 255 { ob as u32 } else { other.get(s) };
|
|
||||||
self.set(s, self_val.saturating_sub(other_val));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// For each slot where `bits` is true, increment `self` by 1.
|
|
||||||
/// Skips zero words entirely — O(n_ones) rather than O(n).
|
|
||||||
fn count_bits<B: BitSlice>(&mut self, bits: &B) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), bits.len(), "IntSlice/BitSlice length mismatch");
|
|
||||||
for (w_idx, &word) in bits.words().iter().enumerate() {
|
|
||||||
if word == 0 { continue; }
|
|
||||||
let base = w_idx * 64;
|
|
||||||
let mut w = word;
|
|
||||||
while w != 0 {
|
|
||||||
let bit = w.trailing_zeros() as usize;
|
|
||||||
let slot = base + bit;
|
|
||||||
if slot < self.len() { self.inc(slot); }
|
|
||||||
w &= w - 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Zero every slot where the corresponding bit in `mask` is 0.
|
|
||||||
/// Iterates only the zero bits — O(n_zeros), O(1) when mask is all-ones.
|
|
||||||
fn mask_with<B: BitSlice>(&mut self, mask: &B) -> &mut Self {
|
|
||||||
assert_eq!(self.len(), mask.len(), "IntSlice/BitSlice length mismatch");
|
|
||||||
let n = self.len();
|
|
||||||
for (wi, &word) in mask.words().iter().enumerate() {
|
|
||||||
if word == u64::MAX { continue; }
|
|
||||||
let mut zeros = !word;
|
|
||||||
while zeros != 0 {
|
|
||||||
let bit = zeros.trailing_zeros() as usize;
|
|
||||||
let s = wi * 64 + bit;
|
|
||||||
if s < n {
|
|
||||||
// u8 is Copy — the immutable borrow from primary_bytes() ends
|
|
||||||
// before the mutable borrow from set() begins.
|
|
||||||
let b = self.primary_bytes()[s];
|
|
||||||
if b != 0 { self.set(s, 0); }
|
|
||||||
}
|
|
||||||
zeros &= zeros - 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ── IntSlice → MemoryBitVec conversions ───────────────────────────────────────
|
|
||||||
|
|
||||||
use crate::memoryvec::MemoryBitVec;
|
|
||||||
|
|
||||||
pub trait IntToBit: IntSlice {
|
|
||||||
/// Bit set iff value >= threshold. Consistent with `geq` and `build_from_counts`.
|
|
||||||
fn to_bitvec(&self, threshold: u32) -> MemoryBitVec { self.geq(threshold) }
|
|
||||||
|
|
||||||
/// Bit set iff value >= 1 (slot is present).
|
|
||||||
fn to_presence(&self) -> MemoryBitVec { self.geq(1) }
|
|
||||||
}
|
|
||||||
|
|
||||||
// Blanket impl: every `IntSlice` implementor gets the `IntToBit` conversions.
impl<T: IntSlice> IntToBit for T {}
|
|
||||||
|
|
||||||
// ── BitSlice → MemoryIntVec conversion ───────────────────────────────────────
|
|
||||||
|
|
||||||
use crate::memoryintvec::MemoryIntVec;
|
|
||||||
|
|
||||||
// Lookup table: entry `b` holds the 8 bits of byte `b` as separate u8 values
// (each 0 or 1), least-significant bit first.
static EXPAND_BYTE: [[u8; 8]; 256] = {
    let mut lut = [[0u8; 8]; 256];
    // Single flat counter over all 256 × 8 cells; `for` is not usable in a
    // const initializer, hence the `while`.
    let mut cell = 0usize;
    while cell < 256 * 8 {
        let value = cell >> 3;
        let pos = cell & 7;
        lut[value][pos] = ((value >> pos) & 1) as u8;
        cell += 1;
    }
    lut
};
|
|
||||||
|
|
||||||
pub trait BitToInt: BitSlice {
|
|
||||||
fn to_intvec(&self) -> MemoryIntVec {
|
|
||||||
let n = self.len();
|
|
||||||
let mut primary = vec![0u8; n];
|
|
||||||
|
|
||||||
let words = self.words();
|
|
||||||
let full_words = n / 64;
|
|
||||||
|
|
||||||
for (w_idx, &word) in words[..full_words].iter().enumerate() {
|
|
||||||
let base = w_idx * 64;
|
|
||||||
for byte_off in 0..8usize {
|
|
||||||
let byte = (word >> (byte_off * 8)) as u8;
|
|
||||||
primary[base + byte_off * 8..base + byte_off * 8 + 8]
|
|
||||||
.copy_from_slice(&EXPAND_BYTE[byte as usize]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let rem = n % 64;
|
|
||||||
if rem > 0 {
|
|
||||||
let word = words[full_words];
|
|
||||||
let base = full_words * 64;
|
|
||||||
for bit in 0..rem {
|
|
||||||
primary[base + bit] = ((word >> bit) & 1) as u8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
MemoryIntVec::from_primary(primary)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Blanket impl: every `BitSlice` implementor gets `to_intvec`.
impl<T: BitSlice> BitToInt for T {}
|
|
||||||
|
|
||||||
// ── Column-level weight statistic — total count or presence count per column.
|
// ── Column-level weight statistic — total count or presence count per column.
|
||||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -0,0 +1,278 @@
|
|||||||
|
use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry};
|
||||||
|
|
||||||
|
// ── BitSliceView ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Lightweight, copy-able read-only view over a u64 word array.
|
||||||
|
/// Bit `i` is in `words[i >> 6]` at position `i & 63`. Padding bits are zero.
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct BitSliceView<'a> {
|
||||||
|
pub(crate) words: &'a [u64],
|
||||||
|
pub(crate) n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> BitSliceView<'a> {
|
||||||
|
#[inline]
|
||||||
|
pub fn new(words: &'a [u64], n: usize) -> Self { Self { words, n } }
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize { self.n }
|
||||||
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
|
pub fn words(&self) -> &'a [u64] { self.words }
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
pub fn get(&self, slot: usize) -> bool {
|
||||||
|
(self.words[slot >> 6] >> (slot & 63)) & 1 != 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn count_ones(&self) -> u64 {
|
||||||
|
self.words.iter().map(|w| w.count_ones() as u64).sum()
|
||||||
|
}
|
||||||
|
pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() }
|
||||||
|
|
||||||
|
pub fn iter(&self) -> BitSliceIter<'a> {
|
||||||
|
BitSliceIter { words: self.words, slot: 0, n: self.n }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) {
|
||||||
|
assert_eq!(self.n, other.n, "BitSliceView length mismatch");
|
||||||
|
self.words.iter().zip(other.words)
|
||||||
|
.fold((0u64, 0u64), |(i, u), (&a, &b)| {
|
||||||
|
(i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 {
|
||||||
|
let (inter, union) = self.partial_jaccard_dist(other);
|
||||||
|
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 {
|
||||||
|
assert_eq!(self.n, other.n, "BitSliceView length mismatch");
|
||||||
|
self.words.iter().zip(other.words)
|
||||||
|
.map(|(&a, &b)| (a ^ b).count_ones() as u64)
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── BitSliceIter ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Iterator over the bits of a packed u64 word slice, in slot order.
pub struct BitSliceIter<'a> {
    /// Packed bits, 64 per word.
    words: &'a [u64],
    /// Next slot to yield.
    slot: usize,
    /// Logical number of bits; iteration stops once `slot` reaches it.
    n: usize,
}

impl Iterator for BitSliceIter<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<bool> {
        let slot = self.slot;
        if slot < self.n {
            let mask = 1u64 << (slot % 64);
            let bit = (self.words[slot / 64] & mask) != 0;
            self.slot = slot + 1;
            Some(bit)
        } else {
            None
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.n - self.slot;
        (remaining, Some(remaining))
    }
}

// `size_hint` is exact, so the default `len()` is valid.
impl ExactSizeIterator for BitSliceIter<'_> {}
|
||||||
|
|
||||||
|
// ── IntSliceView ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/// Lightweight, copy-able read-only view over a compact-int primary array plus
|
||||||
|
/// its sorted raw overflow bytes. Zero-copy: all data lives in the caller's mmap.
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub struct IntSliceView<'a> {
|
||||||
|
pub(crate) primary: &'a [u8],
|
||||||
|
pub(crate) overflow_raw: &'a [u8], // n_overflow × OVERFLOW_ENTRY_SIZE bytes, sorted by slot
|
||||||
|
pub(crate) n_overflow: usize,
|
||||||
|
pub(crate) n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> IntSliceView<'a> {
|
||||||
|
#[inline]
|
||||||
|
pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self {
|
||||||
|
Self { primary, overflow_raw, n_overflow, n }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize { self.n }
|
||||||
|
pub fn is_empty(&self) -> bool { self.n == 0 }
|
||||||
|
pub fn primary_bytes(&self) -> &'a [u8] { self.primary }
|
||||||
|
pub fn n_overflow(&self) -> usize { self.n_overflow }
|
||||||
|
|
||||||
|
pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + 'a {
|
||||||
|
let raw = self.overflow_raw;
|
||||||
|
let n_ov = self.n_overflow;
|
||||||
|
(0..n_ov).map(move |i| parse_overflow_entry(raw, 0, i))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// O(log n_overflow) via binary search (overflow is always sorted by slot).
|
||||||
|
pub fn get(&self, slot: usize) -> u32 {
|
||||||
|
let b = self.primary[slot];
|
||||||
|
if b < 255 { return b as u32; }
|
||||||
|
let mut lo = 0usize;
|
||||||
|
let mut hi = self.n_overflow;
|
||||||
|
while lo < hi {
|
||||||
|
let mid = lo + (hi - lo) / 2;
|
||||||
|
let (s, v) = parse_overflow_entry(self.overflow_raw, 0, mid);
|
||||||
|
match s.cmp(&slot) {
|
||||||
|
std::cmp::Ordering::Equal => return v,
|
||||||
|
std::cmp::Ordering::Less => lo = mid + 1,
|
||||||
|
std::cmp::Ordering::Greater => hi = mid,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
panic!("slot {slot} marked overflow but not found")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sequential merge scan: yields all n values in slot order.
|
||||||
|
pub fn iter(&self) -> IntSliceViewIter<'a> {
|
||||||
|
IntSliceViewIter {
|
||||||
|
primary: self.primary,
|
||||||
|
overflow_raw: self.overflow_raw,
|
||||||
|
slot: 0,
|
||||||
|
overflow_pos: 0,
|
||||||
|
n: self.n,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn sum(&self) -> u64 {
|
||||||
|
byte_sum(self.primary, self.overflow_entries().map(|(_, v)| v))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn count_nonzero(&self) -> u64 {
|
||||||
|
byte_count_nonzero(self.primary)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── Distance methods ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sum_min = self.partial_bray_dist(other);
|
||||||
|
let denom = self.sum() + other.sum();
|
||||||
|
if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| {
|
||||||
|
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||||||
|
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||||||
|
pa.min(pb)
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sa = self.sum() as f64;
|
||||||
|
let sb = other.sum() as f64;
|
||||||
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
|
1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| { let d = a as f64 - b as f64; d * d })
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
self.partial_euclidean_dist(other).sqrt()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| {
|
||||||
|
let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 };
|
||||||
|
let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 };
|
||||||
|
let d = pa - pb;
|
||||||
|
d * d
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sa = self.sum() as f64;
|
||||||
|
let sb = other.sum() as f64;
|
||||||
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
|
self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.map(|(a, b)| {
|
||||||
|
let pa = if sa > 0.0 { (a as f64 / sa).sqrt() } else { 0.0 };
|
||||||
|
let pb = if sb > 0.0 { (b as f64 / sb).sqrt() } else { 0.0 };
|
||||||
|
let d = pa - pb;
|
||||||
|
d * d
|
||||||
|
})
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
let sa = self.sum() as f64;
|
||||||
|
let sb = other.sum() as f64;
|
||||||
|
if sa == 0.0 && sb == 0.0 { return 0.0; }
|
||||||
|
self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) {
|
||||||
|
assert_eq!(self.n, other.n, "length mismatch");
|
||||||
|
self.iter().zip(other.iter())
|
||||||
|
.fold((0u64, 0u64), |(inter, uni), (a, b)| {
|
||||||
|
let ap = a >= threshold;
|
||||||
|
let bp = b >= threshold;
|
||||||
|
(inter + (ap & bp) as u64, uni + (ap | bp) as u64)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 {
|
||||||
|
let (inter, union) = self.partial_threshold_jaccard_dist(other, threshold);
|
||||||
|
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 {
|
||||||
|
self.threshold_jaccard_dist(other, 1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ── IntSliceViewIter ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
pub struct IntSliceViewIter<'a> {
|
||||||
|
primary: &'a [u8],
|
||||||
|
overflow_raw: &'a [u8],
|
||||||
|
slot: usize,
|
||||||
|
overflow_pos: usize,
|
||||||
|
n: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for IntSliceViewIter<'_> {
|
||||||
|
type Item = u32;
|
||||||
|
fn next(&mut self) -> Option<u32> {
|
||||||
|
if self.slot >= self.n { return None; }
|
||||||
|
let v = self.primary[self.slot];
|
||||||
|
self.slot += 1;
|
||||||
|
if v < 255 {
|
||||||
|
Some(v as u32)
|
||||||
|
} else {
|
||||||
|
let (_, val) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos);
|
||||||
|
self.overflow_pos += 1;
|
||||||
|
Some(val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||||
|
let rem = self.n - self.slot;
|
||||||
|
(rem, Some(rem))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl ExactSizeIterator for IntSliceViewIter<'_> {}
|
||||||
@@ -3,7 +3,6 @@ use std::io;
|
|||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder};
|
use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder};
|
||||||
use obicompactvec::traits::BitSliceMut;
|
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obilayeredmap::{IndexMode, OLMError};
|
use obilayeredmap::{IndexMode, OLMError};
|
||||||
use obiskio::{SKError, SKResult};
|
use obiskio::{SKError, SKResult};
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ use obicompactvec::{
|
|||||||
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder,
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder,
|
||||||
};
|
};
|
||||||
use obicompactvec::traits::BitSliceMut;
|
|
||||||
use obilayeredmap::meta::PartitionMeta;
|
use obilayeredmap::meta::PartitionMeta;
|
||||||
use obilayeredmap::OLMError;
|
use obilayeredmap::OLMError;
|
||||||
use obiskio::{SKError, SKResult};
|
use obiskio::{SKError, SKResult};
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ use obicompactvec::{
|
|||||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||||
};
|
};
|
||||||
use obicompactvec::traits::BitSliceMut;
|
|
||||||
use obikseq::CanonicalKmer;
|
use obikseq::CanonicalKmer;
|
||||||
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
use obiskio::{UnitigFileReader, UnitigFileWriter};
|
||||||
|
|
||||||
|
|||||||
@@ -102,7 +102,6 @@ mod tests {
|
|||||||
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
PersistentBitMatrix, PersistentBitMatrixBuilder,
|
||||||
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
|
||||||
};
|
};
|
||||||
use obicompactvec::traits::BitSliceMut;
|
|
||||||
use tempfile::tempdir;
|
use tempfile::tempdir;
|
||||||
|
|
||||||
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||||
|
|||||||
Reference in New Issue
Block a user