feat: add memory vectors, slice traits, and column extraction methods

Introduce `MemoryBitVec` and `MemoryIntVec` for efficient in-memory storage with hybrid compression and overflow handling. Implement `BitSlice`, `BitSliceMut`, `IntSlice`, and `IntSliceMut` traits across persistent and memory-backed types to enable generic slice operations and bitwise/arithmetic overloads. Add `col_persist` and `col_as_memory` methods to `BitMatrix` and `IntMatrix` for efficient column extraction. Align with the new single-pass rebuild architecture by supporting fast kmer filtering and matrix rebuilding. Includes comprehensive tests and profiling instrumentation for the packing phase.
This commit is contained in:
Eric Coissac
2026-06-16 23:18:10 +02:00
parent b6fcbc545f
commit cde6457eea
15 changed files with 1120 additions and 70 deletions
+44
View File
@@ -7,6 +7,8 @@ use ndarray::{Array1, Array2};
use rayon::prelude::*;
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
use crate::memoryvec::MemoryBitVec;
use crate::traits::BitSliceMut;
use crate::layer_meta::LayerMeta;
use crate::meta::MatrixMeta;
@@ -154,6 +156,28 @@ impl PackedBitMatrix {
&self.mmap[start..start + len]
}
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
}
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryBitVec {
let bytes = self.col_bytes(c);
let n = self.n_rows;
let n_words = n.div_ceil(64);
let mut words = vec![0u64; n_words];
let full = bytes.len() / 8;
for (i, chunk) in bytes[..full * 8].chunks_exact(8).enumerate() {
words[i] = u64::from_le_bytes(chunk.try_into().unwrap());
}
let rem = bytes.len() % 8;
if rem > 0 {
let mut last = [0u8; 8];
last[..rem].copy_from_slice(&bytes[full * 8..]);
words[full] = u64::from_le_bytes(last);
}
MemoryBitVec::from_words(words, n)
}
fn count_ones_col(&self, c: usize) -> u64 {
let bytes = self.col_bytes(c);
let full = self.n_rows / 8;
@@ -343,6 +367,26 @@ impl PersistentBitMatrix {
}
}
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
match self {
Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
Self::Packed(m) => m.col_persist(c, path),
Self::Implicit { n_rows, .. } => {
let mut b = PersistentBitVecBuilder::new(*n_rows, path)?;
b.not();
Ok(b)
}
}
}
pub fn col_as_memory(&self, c: usize) -> MemoryBitVec {
match self {
Self::Columnar(m) => MemoryBitVec::from(m.col(c)),
Self::Packed(m) => m.col_as_memory(c),
Self::Implicit { n_rows, .. } => MemoryBitVec::ones(*n_rows),
}
}
pub fn row(&self, slot: usize) -> Box<[bool]> {
match self {
Self::Columnar(m) => m.row(slot),
+39 -35
View File
@@ -188,6 +188,21 @@ impl PersistentBitVecBuilder {
Ok(Self { mmap, n })
}
/// Create a PBIV file from raw packed bit-bytes, zero-padding to the next word boundary.
/// `bytes` is `n.div_ceil(8)` bytes; `n` is the number of bits.
pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
let file_size = HEADER_SIZE + n_bytes_for_words(n);
let file = OpenOptions::new()
.read(true).write(true).create(true).truncate(true)
.open(path)?;
file.set_len(file_size as u64)?;
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
mmap[0..4].copy_from_slice(&MAGIC);
mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes());
mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()].copy_from_slice(bytes);
Ok(Self { mmap, n })
}
pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
fs::copy(source.path(), path)?;
let file = OpenOptions::new().read(true).write(true).open(path)?;
@@ -217,6 +232,12 @@ impl PersistentBitVecBuilder {
}
}
fn data_words(&self) -> &[u64] {
let nw = n_words(self.n);
let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
unsafe { std::slice::from_raw_parts(ptr, nw) }
}
// SAFETY: same alignment argument as PersistentBitVec::data_words.
fn data_words_mut(&mut self) -> &mut [u64] {
let nw = n_words(self.n);
@@ -224,41 +245,6 @@ impl PersistentBitVecBuilder {
unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
}
pub fn and(&mut self, other: &PersistentBitVec) {
assert_eq!(self.n, other.n, "length mismatch");
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
*sw &= ow;
}
}
pub fn or(&mut self, other: &PersistentBitVec) {
assert_eq!(self.n, other.n, "length mismatch");
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
*sw |= ow;
}
}
pub fn xor(&mut self, other: &PersistentBitVec) {
assert_eq!(self.n, other.n, "length mismatch");
for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
*sw ^= ow;
}
}
pub fn not(&mut self) {
let rem = self.n % 64;
let words = self.data_words_mut();
for w in words.iter_mut() {
*w ^= u64::MAX;
}
// Zero padding bits in the last word so count_ones / jaccard remain correct.
if rem != 0 {
if let Some(last) = words.last_mut() {
*last &= (1u64 << rem) - 1;
}
}
}
/// Convert a count vector to a bit vector: bit set iff count >= threshold.
/// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
pub fn build_from_counts(
@@ -304,3 +290,21 @@ impl PersistentBitVecBuilder {
self.mmap.flush()
}
}
// ── BitSlice / BitSliceMut impls ──────────────────────────────────────────────
use crate::traits::{BitSlice, BitSliceMut};
impl BitSlice for PersistentBitVec {
fn len(&self) -> usize { self.n }
fn words(&self) -> &[u64] { self.data_words() }
}
impl BitSlice for PersistentBitVecBuilder {
fn len(&self) -> usize { self.n }
fn words(&self) -> &[u64] { self.data_words() }
}
impl BitSliceMut for PersistentBitVecBuilder {
fn words_mut(&mut self) -> &mut [u64] { self.data_words_mut() }
}
+43 -33
View File
@@ -34,6 +34,36 @@ impl PersistentCompactIntVecBuilder {
})
}
/// Create from a [`MemoryIntVec`], copying primary bytes directly into the mmap.
/// O(n) memcpy + O(n_overflow) HashMap clone — no per-slot `set` overhead.
pub fn from_memory(src: &crate::memoryintvec::MemoryIntVec, path: &Path) -> io::Result<Self> {
let n = src.len();
let file = OpenOptions::new()
.read(true).write(true).create(true).truncate(true)
.open(path)?;
file.set_len((HEADER_SIZE + n) as u64)?;
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(src.primary_bytes());
Ok(Self {
path: path.to_path_buf(),
mmap,
n,
overflow: src.overflow_map().clone(),
})
}
/// Create from raw primary bytes + an already-built overflow map (no per-slot overhead).
pub(crate) fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
let n = primary.len();
let file = OpenOptions::new()
.read(true).write(true).create(true).truncate(true)
.open(path)?;
file.set_len((HEADER_SIZE + n) as u64)?;
let mut mmap = unsafe { MmapMut::map_mut(&file)? };
mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(primary);
Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
}
/// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
/// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow).
pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
@@ -91,39 +121,6 @@ impl PersistentCompactIntVecBuilder {
self.n == 0
}
pub fn min(&mut self, other: &PersistentCompactIntVec) {
assert_eq!(self.n, other.len(), "length mismatch");
for (slot, other_val) in other.iter().enumerate() {
if other_val < self.get(slot) {
self.set(slot, other_val);
}
}
}
pub fn max(&mut self, other: &PersistentCompactIntVec) {
assert_eq!(self.n, other.len(), "length mismatch");
for (slot, other_val) in other.iter().enumerate() {
if other_val > self.get(slot) {
self.set(slot, other_val);
}
}
}
pub fn add(&mut self, other: &PersistentCompactIntVec) {
assert_eq!(self.n, other.len(), "length mismatch");
for (slot, other_val) in other.iter().enumerate() {
let cur = self.get(slot);
self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add"));
}
}
pub fn diff(&mut self, other: &PersistentCompactIntVec) {
assert_eq!(self.n, other.len(), "length mismatch");
for (slot, other_val) in other.iter().enumerate() {
self.set(slot, self.get(slot).saturating_sub(other_val));
}
}
/// Flush the primary mmap, then write sorted overflow data + index and fix the header.
pub fn close(self) -> io::Result<()> {
self.mmap.flush()?;
@@ -141,3 +138,16 @@ impl PersistentCompactIntVecBuilder {
finalize_pciv(&path, n, &entries)
}
}
// ── IntSlice / IntSliceMut impls ──────────────────────────────────────────────
use crate::traits::{IntSlice, IntSliceMut};
impl IntSlice for PersistentCompactIntVecBuilder {
fn len(&self) -> usize { self.n }
fn get(&self, slot: usize) -> u32 { self.get(slot) }
}
impl IntSliceMut for PersistentCompactIntVecBuilder {
fn set(&mut self, slot: usize, value: u32) { self.set(slot, value); }
}
+42
View File
@@ -1,4 +1,5 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fs::{self, File};
use std::io::{self, BufWriter, Write as _};
use std::path::{Path, PathBuf};
@@ -8,6 +9,7 @@ use ndarray::{Array1, Array2};
use rayon::prelude::*;
use crate::builder::PersistentCompactIntVecBuilder;
use crate::memoryintvec::MemoryIntVec;
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
@@ -194,6 +196,32 @@ impl PackedCompactIntMatrix {
Ok(Self { mmap, n_rows, n_cols, columns })
}
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
let ci = &self.columns[c];
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
let mut overflow = HashMap::with_capacity(ci.n_overflow);
for i in 0..ci.n_overflow {
let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
overflow.insert(slot, value);
}
PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path)
}
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
let ci = &self.columns[c];
let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
let mut overflow = HashMap::with_capacity(ci.n_overflow);
for i in 0..ci.n_overflow {
let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
overflow.insert(slot, value);
}
MemoryIntVec::from_primary_and_overflow(primary, overflow)
}
#[inline]
pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
let ci = &self.columns[col];
@@ -442,6 +470,20 @@ impl PersistentCompactIntMatrix {
}
}
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
match self {
Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path),
Self::Packed(m) => m.col_persist(c, path),
}
}
pub fn col_as_memory(&self, c: usize) -> MemoryIntVec {
match self {
Self::Columnar(m) => MemoryIntVec::from(m.col(c)),
Self::Packed(m) => m.col_as_memory(c),
}
}
pub fn row(&self, slot: usize) -> Box<[u32]> {
match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
}
+5 -1
View File
@@ -4,6 +4,8 @@ mod builder;
mod format;
mod intmatrix;
mod layer_meta;
mod memoryintvec;
mod memoryvec;
mod meta;
mod reader;
pub mod traits;
@@ -13,8 +15,10 @@ pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_ma
pub use builder::PersistentCompactIntVecBuilder;
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
pub use layer_meta::LayerMeta;
pub use memoryintvec::MemoryIntVec;
pub use memoryvec::MemoryBitVec;
pub use reader::PersistentCompactIntVec;
pub use traits::{BitPartials, ColumnWeights, CountPartials};
pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
#[cfg(test)]
#[path = "tests/mod.rs"]
+120
View File
@@ -0,0 +1,120 @@
use std::collections::HashMap;
use std::io;
use std::ops::{Add, AddAssign, Sub, SubAssign};
use std::path::Path;
use crate::builder::PersistentCompactIntVecBuilder;
use crate::traits::{IntSlice, IntSliceMut};
// ── MemoryIntVec ──────────────────────────────────────────────────────────────
#[derive(Clone)]
pub struct MemoryIntVec {
primary: Vec<u8>,
overflow: HashMap<usize, u32>,
n: usize,
}
impl MemoryIntVec {
pub fn new(n: usize) -> Self {
Self { primary: vec![0u8; n], overflow: HashMap::new(), n }
}
pub fn len(&self) -> usize { self.n }
pub fn is_empty(&self) -> bool { self.n == 0 }
/// Construct directly from a pre-built primary array (no overflow — all values < 255).
pub(crate) fn from_primary(primary: Vec<u8>) -> Self {
let n = primary.len();
Self { primary, overflow: HashMap::new(), n }
}
pub(crate) fn from_primary_and_overflow(primary: Vec<u8>, overflow: HashMap<usize, u32>) -> Self {
let n = primary.len();
Self { primary, overflow, n }
}
pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary }
pub(crate) fn overflow_map(&self) -> &HashMap<usize, u32> { &self.overflow }
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
(0..self.n).map(move |slot| self.get(slot))
}
/// Write to disk and return a writable builder at `path`.
pub fn persist(&self, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
PersistentCompactIntVecBuilder::from_memory(self, path)
}
}
// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
impl IntSlice for MemoryIntVec {
fn len(&self) -> usize { self.n }
fn get(&self, slot: usize) -> u32 {
match self.primary[slot] {
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
v => v as u32,
}
}
}
impl IntSliceMut for MemoryIntVec {
fn set(&mut self, slot: usize, value: u32) {
if value < 255 {
self.primary[slot] = value as u8;
self.overflow.remove(&slot);
} else {
self.primary[slot] = 255;
self.overflow.insert(slot, value);
}
}
}
// ── From conversions ──────────────────────────────────────────────────────────
impl<S: IntSlice> From<&S> for MemoryIntVec {
fn from(src: &S) -> Self {
let mut v = Self::new(src.len());
for slot in 0..src.len() {
let val = src.get(slot);
if val != 0 { v.set(slot, val); }
}
v
}
}
// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
impl<B: IntSlice> Add<&B> for MemoryIntVec {
type Output = MemoryIntVec;
fn add(mut self, rhs: &B) -> MemoryIntVec { IntSliceMut::add(&mut self, rhs); self }
}
impl<B: IntSlice> Sub<&B> for MemoryIntVec {
type Output = MemoryIntVec;
fn sub(mut self, rhs: &B) -> MemoryIntVec { self.diff(rhs); self }
}
// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
impl<B: IntSlice> Add<&B> for &MemoryIntVec {
type Output = MemoryIntVec;
fn add(self, rhs: &B) -> MemoryIntVec { self.clone().add(rhs) }
}
impl<B: IntSlice> Sub<&B> for &MemoryIntVec {
type Output = MemoryIntVec;
fn sub(self, rhs: &B) -> MemoryIntVec { self.clone().sub(rhs) }
}
// ── std::ops — in-place assign ────────────────────────────────────────────────
impl<B: IntSlice> AddAssign<&B> for MemoryIntVec {
fn add_assign(&mut self, rhs: &B) { IntSliceMut::add(self, rhs); }
}
impl<B: IntSlice> SubAssign<&B> for MemoryIntVec {
fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); }
}
+138
View File
@@ -0,0 +1,138 @@
use std::io;
use std::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not};
use std::path::Path;
use crate::bitvec::PersistentBitVecBuilder;
use crate::traits::{BitSlice, BitSliceMut};
#[inline]
fn n_words(n: usize) -> usize { n.div_ceil(64) }
// ── MemoryBitVec ──────────────────────────────────────────────────────────────
#[derive(Clone)]
pub struct MemoryBitVec {
words: Vec<u64>,
n: usize,
}
impl MemoryBitVec {
pub fn new(n: usize) -> Self {
Self { words: vec![0u64; n_words(n)], n }
}
pub fn ones(n: usize) -> Self {
let rem = n % 64;
let mut words = vec![u64::MAX; n_words(n)];
if rem != 0 {
if let Some(last) = words.last_mut() { *last = (1u64 << rem) - 1; }
}
Self { words, n }
}
pub(crate) fn from_words(words: Vec<u64>, n: usize) -> Self {
Self { words, n }
}
pub fn len(&self) -> usize { self.n }
pub fn is_empty(&self) -> bool { self.n == 0 }
pub fn get(&self, slot: usize) -> bool {
(self.words[slot >> 6] >> (slot & 63)) & 1 != 0
}
pub fn set(&mut self, slot: usize, value: bool) {
let bit = 1u64 << (slot & 63);
if value { self.words[slot >> 6] |= bit; } else { self.words[slot >> 6] &= !bit; }
}
pub fn count_ones(&self) -> u64 {
self.words.iter().map(|w| w.count_ones() as u64).sum()
}
pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() }
/// Write to disk and return a writable builder positioned at the same path.
pub fn persist(&self, path: &Path) -> io::Result<PersistentBitVecBuilder> {
let mut b = PersistentBitVecBuilder::new(self.n, path)?;
b.copy_from(self);
Ok(b)
}
}
// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
impl BitSlice for MemoryBitVec {
fn len(&self) -> usize { self.n }
fn words(&self) -> &[u64] { &self.words }
}
impl BitSliceMut for MemoryBitVec {
fn words_mut(&mut self) -> &mut [u64] { &mut self.words }
}
// ── From conversions ──────────────────────────────────────────────────────────
impl<S: BitSlice> From<&S> for MemoryBitVec {
fn from(src: &S) -> Self {
Self { words: src.words().to_vec(), n: src.len() }
}
}
// ── std::ops — owned (consumes lhs) ──────────────────────────────────────────
impl<B: BitSlice> BitAnd<&B> for MemoryBitVec {
type Output = MemoryBitVec;
fn bitand(mut self, rhs: &B) -> MemoryBitVec { self.and(rhs); self }
}
impl<B: BitSlice> BitOr<&B> for MemoryBitVec {
type Output = MemoryBitVec;
fn bitor(mut self, rhs: &B) -> MemoryBitVec { self.or(rhs); self }
}
impl<B: BitSlice> BitXor<&B> for MemoryBitVec {
type Output = MemoryBitVec;
fn bitxor(mut self, rhs: &B) -> MemoryBitVec { self.xor(rhs); self }
}
impl Not for MemoryBitVec {
type Output = MemoryBitVec;
fn not(mut self) -> MemoryBitVec { BitSliceMut::not(&mut self); self }
}
// ── std::ops — borrowed (clones lhs) ─────────────────────────────────────────
impl<B: BitSlice> BitAnd<&B> for &MemoryBitVec {
type Output = MemoryBitVec;
fn bitand(self, rhs: &B) -> MemoryBitVec { self.clone().bitand(rhs) }
}
impl<B: BitSlice> BitOr<&B> for &MemoryBitVec {
type Output = MemoryBitVec;
fn bitor(self, rhs: &B) -> MemoryBitVec { self.clone().bitor(rhs) }
}
impl<B: BitSlice> BitXor<&B> for &MemoryBitVec {
type Output = MemoryBitVec;
fn bitxor(self, rhs: &B) -> MemoryBitVec { self.clone().bitxor(rhs) }
}
impl Not for &MemoryBitVec {
type Output = MemoryBitVec;
fn not(self) -> MemoryBitVec { !self.clone() }
}
// ── std::ops — in-place assign ────────────────────────────────────────────────
impl<B: BitSlice> BitAndAssign<&B> for MemoryBitVec {
fn bitand_assign(&mut self, rhs: &B) { self.and(rhs); }
}
impl<B: BitSlice> BitOrAssign<&B> for MemoryBitVec {
fn bitor_assign(&mut self, rhs: &B) { self.or(rhs); }
}
impl<B: BitSlice> BitXorAssign<&B> for MemoryBitVec {
fn bitxor_assign(&mut self, rhs: &B) { self.xor(rhs); }
}
+11
View File
@@ -353,6 +353,17 @@ impl PersistentCompactIntVec {
}
}
// ── IntSlice impl ─────────────────────────────────────────────────────────────
use crate::traits::IntSlice;
impl IntSlice for PersistentCompactIntVec {
fn len(&self) -> usize { self.n }
fn get(&self, slot: usize) -> u32 { self.get(slot) }
fn sum(&self) -> u64 { self.sum() }
fn count_nonzero(&self) -> u64 { self.count_nonzero() }
}
impl<'a> IntoIterator for &'a PersistentCompactIntVec {
type Item = u32;
type IntoIter = Iter<'a>;
+1
View File
@@ -1,5 +1,6 @@
use tempfile::tempdir;
use crate::traits::BitSliceMut;
use crate::{PersistentBitVec, PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
fn make_bv(bits: &[bool]) -> (tempfile::TempDir, PersistentBitVec) {
+359
View File
@@ -0,0 +1,359 @@
use tempfile::tempdir;
use crate::traits::{BitSlice, BitSliceMut, BitToInt, IntSlice, IntSliceMut, IntToBit};
use crate::{MemoryBitVec, MemoryIntVec, PersistentBitVec, PersistentBitVecBuilder};
// ── MemoryBitVec ──────────────────────────────────────────────────────────────
#[test]
fn mbv_new_all_zero() {
let v = MemoryBitVec::new(10);
assert_eq!(v.len(), 10);
assert!(!(0..10).any(|s| v.get(s)));
assert_eq!(v.count_ones(), 0);
}
#[test]
fn mbv_ones_all_set() {
let v = MemoryBitVec::ones(10);
assert!((0..10).all(|s| v.get(s)));
assert_eq!(v.count_ones(), 10);
assert_eq!(v.count_zeros(), 0);
}
#[test]
fn mbv_ones_no_padding_leak() {
// 5 bits: padding bits in last word must stay 0
let v = MemoryBitVec::ones(5);
assert_eq!(v.words()[0], 0b11111);
}
#[test]
fn mbv_set_get_roundtrip() {
let mut v = MemoryBitVec::new(64);
v.set(0, true);
v.set(63, true);
assert!(v.get(0));
assert!(!v.get(1));
assert!(v.get(63));
assert_eq!(v.count_ones(), 2);
}
#[test]
fn mbv_and() {
let mut a = MemoryBitVec::new(4);
a.set(0, true); a.set(1, true);
let mut b = MemoryBitVec::new(4);
b.set(0, true); b.set(2, true);
a.and(&b);
assert!(a.get(0)); assert!(!a.get(1)); assert!(!a.get(2));
}
#[test]
fn mbv_or() {
let mut a = MemoryBitVec::new(4);
a.set(0, true); a.set(1, true);
let mut b = MemoryBitVec::new(4);
b.set(0, true); b.set(2, true);
a.or(&b);
assert!(a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
}
#[test]
fn mbv_xor() {
let mut a = MemoryBitVec::new(4);
a.set(0, true); a.set(1, true);
let mut b = MemoryBitVec::new(4);
b.set(0, true); b.set(2, true);
a.xor(&b);
assert!(!a.get(0)); assert!(a.get(1)); assert!(a.get(2)); assert!(!a.get(3));
}
#[test]
fn mbv_not() {
let mut a = MemoryBitVec::new(4);
a.set(0, true); a.set(2, true);
a.not();
assert!(!a.get(0)); assert!(a.get(1)); assert!(!a.get(2)); assert!(a.get(3));
}
#[test]
fn mbv_not_no_padding_leak() {
let mut v = MemoryBitVec::new(5);
v.not();
assert_eq!(v.count_ones(), 5);
assert_eq!(v.words()[0], 0b11111);
}
#[test]
fn mbv_ops_chaining() {
let mut a = MemoryBitVec::ones(8);
let b = MemoryBitVec::new(8); // all zeros
a.and(&b).or(&b).not();
assert_eq!(a.count_ones(), 8);
}
#[test]
fn mbv_std_ops_owned() {
let mut a = MemoryBitVec::new(4);
a.set(0, true); a.set(1, true);
let mut b = MemoryBitVec::new(4);
b.set(1, true); b.set(2, true);
let c = a & &b;
assert!(!c.get(0)); assert!(c.get(1)); assert!(!c.get(2));
}
#[test]
fn mbv_std_ops_assign() {
let mut a = MemoryBitVec::new(4);
a.set(0, true); a.set(1, true);
let mut b = MemoryBitVec::new(4);
b.set(1, true); b.set(2, true);
a &= &b;
assert!(!a.get(0)); assert!(a.get(1));
}
#[test]
fn mbv_from_persistent() {
let dir = tempdir().unwrap();
let path = dir.path().join("v.pbiv");
let mut builder = PersistentBitVecBuilder::new(4, &path).unwrap();
builder.set(1, true); builder.set(3, true);
builder.close().unwrap();
let pv = PersistentBitVec::open(&path).unwrap();
let mv = MemoryBitVec::from(&pv);
assert!(!mv.get(0)); assert!(mv.get(1)); assert!(!mv.get(2)); assert!(mv.get(3));
}
#[test]
fn mbv_persist_roundtrip() {
let dir = tempdir().unwrap();
let path = dir.path().join("out.pbiv");
let mut v = MemoryBitVec::new(8);
v.set(2, true); v.set(5, true);
let builder = v.persist(&path).unwrap();
builder.close().unwrap();
let pv = PersistentBitVec::open(&path).unwrap();
assert!(pv.get(2)); assert!(pv.get(5));
assert_eq!(pv.count_ones(), 2);
}
// ── MemoryIntVec ──────────────────────────────────────────────────────────────
#[test]
fn miv_new_all_zero() {
let v = MemoryIntVec::new(10);
assert_eq!(v.len(), 10);
assert!((0..10).all(|s| v.get(s) == 0));
}
#[test]
fn miv_set_get_roundtrip() {
let mut v = MemoryIntVec::new(4);
v.set(0, 42); v.set(3, 200);
assert_eq!(v.get(0), 42);
assert_eq!(v.get(1), 0);
assert_eq!(v.get(3), 200);
}
#[test]
fn miv_overflow_roundtrip() {
let mut v = MemoryIntVec::new(4);
v.set(1, 1000);
assert_eq!(v.get(1), 1000);
assert_eq!(v.get(0), 0);
}
#[test]
fn miv_inc_dec() {
let mut v = MemoryIntVec::new(4);
v.inc(2); v.inc(2); v.inc(2);
assert_eq!(v.get(2), 3);
v.dec(2);
assert_eq!(v.get(2), 2);
}
#[test]
fn miv_dec_saturates_at_zero() {
let mut v = MemoryIntVec::new(4);
v.dec(0);
assert_eq!(v.get(0), 0);
}
#[test]
fn miv_add_at() {
let mut v = MemoryIntVec::new(4);
v.add_at(1, 100); v.add_at(1, 200);
assert_eq!(v.get(1), 300);
}
#[test]
fn miv_min_max() {
let mut a = MemoryIntVec::new(4);
a.set(0, 5); a.set(1, 2); a.set(2, 8);
let mut b = MemoryIntVec::new(4);
b.set(0, 3); b.set(1, 7); b.set(2, 8);
let mut c = MemoryIntVec::from(&a);
IntSliceMut::min(&mut c, &b);
assert_eq!(c.get(0), 3); assert_eq!(c.get(1), 2); assert_eq!(c.get(2), 8);
let mut d = MemoryIntVec::from(&a);
IntSliceMut::max(&mut d, &b);
assert_eq!(d.get(0), 5); assert_eq!(d.get(1), 7); assert_eq!(d.get(2), 8);
}
#[test]
fn miv_add_diff() {
let mut a = MemoryIntVec::new(3);
a.set(0, 10); a.set(1, 5);
let mut b = MemoryIntVec::new(3);
b.set(0, 3); b.set(1, 8);
let mut c = MemoryIntVec::from(&a);
c.add(&b);
assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
let mut d = MemoryIntVec::from(&a);
d.diff(&b);
assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0); // saturating sub
}
#[test]
fn miv_std_ops() {
let mut a = MemoryIntVec::new(3);
a.set(0, 10); a.set(1, 5);
let mut b = MemoryIntVec::new(3);
b.set(0, 3); b.set(1, 8);
let c = &a + &b;
assert_eq!(c.get(0), 13); assert_eq!(c.get(1), 13);
let d = &a - &b;
assert_eq!(d.get(0), 7); assert_eq!(d.get(1), 0);
}
#[test]
fn miv_from_persistent() {
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
let dir = tempdir().unwrap();
let path = dir.path().join("v.pciv");
let mut b = PersistentCompactIntVecBuilder::new(4, &path).unwrap();
b.set(1, 42); b.set(3, 1000);
b.close().unwrap();
let pv = PersistentCompactIntVec::open(&path).unwrap();
let mv = MemoryIntVec::from(&pv);
assert_eq!(mv.get(0), 0); assert_eq!(mv.get(1), 42); assert_eq!(mv.get(3), 1000);
}
// ── Cross-type conversions ────────────────────────────────────────────────────
#[test]
fn to_bitvec_threshold() {
let mut v = MemoryIntVec::new(5);
v.set(0, 0); v.set(1, 1); v.set(2, 5); v.set(3, 10); v.set(4, 3);
let bv = v.to_bitvec(4); // > 4: slots 2 (5) and 3 (10) pass
assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
assert!(bv.get(3)); assert!(!bv.get(4));
}
#[test]
fn to_presence() {
let mut v = MemoryIntVec::new(4);
v.set(1, 1); v.set(3, 100);
let bv = v.to_presence();
assert!(!bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
}
#[test]
fn to_intvec_roundtrip() {
let mut bv = MemoryBitVec::new(8);
bv.set(0, true); bv.set(3, true); bv.set(7, true);
let iv = bv.to_intvec();
assert_eq!(iv.get(0), 1); assert_eq!(iv.get(1), 0);
assert_eq!(iv.get(3), 1); assert_eq!(iv.get(7), 1);
}
#[test]
fn to_intvec_word_boundary() {
// 65 bits: spans two words
let mut bv = MemoryBitVec::new(65);
bv.set(63, true); bv.set(64, true);
let iv = bv.to_intvec();
assert_eq!(iv.get(63), 1); assert_eq!(iv.get(64), 1); assert_eq!(iv.get(62), 0);
}
#[test]
fn count_bits_accumulates() {
let mut count = MemoryIntVec::new(8);
let mut b1 = MemoryBitVec::new(8);
b1.set(0, true); b1.set(2, true);
let mut b2 = MemoryBitVec::new(8);
b2.set(0, true); b2.set(3, true);
let mut b3 = MemoryBitVec::new(8);
b3.set(2, true); b3.set(3, true);
count.count_bits(&b1).count_bits(&b2).count_bits(&b3);
assert_eq!(count.get(0), 2);
assert_eq!(count.get(2), 2);
assert_eq!(count.get(3), 2);
assert_eq!(count.get(1), 0);
}
#[test]
fn count_bits_skips_zero_words() {
// Entire first word is zero — should not touch those slots
let mut count = MemoryIntVec::new(128);
let mut bv = MemoryBitVec::new(128);
bv.set(64, true); bv.set(127, true);
count.count_bits(&bv);
assert_eq!(count.get(0), 0);
assert_eq!(count.get(64), 1);
assert_eq!(count.get(127), 1);
}
// ── Comparison operators ──────────────────────────────────────────────────────
#[test]
fn cmp_gt() {
let mut v = MemoryIntVec::new(5);
v.set(0, 0); v.set(1, 3); v.set(2, 5); v.set(3, 3); v.set(4, 10);
let bv = v.gt(3);
assert!(!bv.get(0)); assert!(!bv.get(1)); assert!(bv.get(2));
assert!(!bv.get(3)); assert!(bv.get(4));
}
#[test]
fn cmp_geq() {
let mut v = MemoryIntVec::new(4);
v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 1);
let bv = v.geq(3);
assert!(!bv.get(0)); assert!(bv.get(1)); assert!(bv.get(2)); assert!(!bv.get(3));
}
#[test]
fn cmp_lt() {
let mut v = MemoryIntVec::new(4);
v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 0);
let bv = v.lt(3);
assert!(bv.get(0)); assert!(!bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
}
#[test]
fn cmp_leq() {
let mut v = MemoryIntVec::new(4);
v.set(0, 2); v.set(1, 3); v.set(2, 4); v.set(3, 3);
let bv = v.leq(3);
assert!(bv.get(0)); assert!(bv.get(1)); assert!(!bv.get(2)); assert!(bv.get(3));
}
#[test]
fn filter_pattern() {
// Typical filter: ingroup >= min_count AND outgroup <= max_outgroup
let mut ingroup = MemoryIntVec::new(6);
let mut outgroup = MemoryIntVec::new(6);
// slot 2: ingroup=3, outgroup=0 → keep
// slot 4: ingroup=2, outgroup=1 → drop (outgroup > 0)
// slot 5: ingroup=1, outgroup=0 → drop (ingroup < 2)
ingroup.set(2, 3); ingroup.set(4, 2); ingroup.set(5, 1);
outgroup.set(4, 1);
let out_mask = outgroup.leq(0);
let mut in_mask = ingroup.geq(2);
let keep = in_mask.and(&out_mask);
assert!(!keep.get(0)); assert!(!keep.get(1));
assert!(keep.get(2));
assert!(!keep.get(4)); assert!(!keep.get(5));
}
+3
View File
@@ -1,9 +1,12 @@
mod bitmatrix;
mod bitvec;
mod intmatrix;
mod memoryvec;
use tempfile::tempdir;
use crate::traits::IntSliceMut;
use crate::{PersistentCompactIntVec, PersistentCompactIntVecBuilder};
fn roundtrip(values: &[(usize, u32)], n: usize) -> Vec<u32> {
+208 -1
View File
@@ -1,6 +1,213 @@
use ndarray::{Array1, Array2};
/// Column-level weight statistic — total count or presence count per column.
// ── BitSlice / BitSliceMut ────────────────────────────────────────────────────
/// Read-only view over the u64 word array of a bit vector.
///
/// Bit `i` is in `words()[i >> 6]` at position `i & 63`.
/// Padding bits in the last word are zero.
pub trait BitSlice {
fn len(&self) -> usize;
fn words(&self) -> &[u64];
fn is_empty(&self) -> bool { self.len() == 0 }
fn get(&self, slot: usize) -> bool {
(self.words()[slot >> 6] >> (slot & 63)) & 1 != 0
}
}
/// Mutable view over a bit-vector word array; default methods maintain the
/// zero-padding invariant on the last word.
pub trait BitSliceMut: BitSlice {
fn words_mut(&mut self) -> &mut [u64];
fn copy_from<S: BitSlice>(&mut self, src: &S) -> &mut Self {
assert_eq!(self.len(), src.len(), "BitSlice length mismatch");
self.words_mut().copy_from_slice(src.words());
self
}
fn and<S: BitSlice>(&mut self, other: &S) -> &mut Self {
assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w &= o; }
self
}
fn or<S: BitSlice>(&mut self, other: &S) -> &mut Self {
assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w |= o; }
self
}
fn xor<S: BitSlice>(&mut self, other: &S) -> &mut Self {
assert_eq!(self.len(), other.len(), "BitSlice length mismatch");
for (w, &o) in self.words_mut().iter_mut().zip(other.words()) { *w ^= o; }
self
}
fn not(&mut self) -> &mut Self {
let rem = self.len() % 64;
let words = self.words_mut();
for w in words.iter_mut() { *w ^= u64::MAX; }
if rem != 0 {
if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; }
}
self
}
}
// ── IntSlice / IntSliceMut ────────────────────────────────────────────────────
/// Read-only access to a compact integer vector (values encoded as u32).
pub trait IntSlice {
fn len(&self) -> usize;
fn get(&self, slot: usize) -> u32;
fn is_empty(&self) -> bool { self.len() == 0 }
fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() }
fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 }
fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v < threshold) }
fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) }
fn gt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v > threshold) }
fn geq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v >= threshold) }
fn cmp_scalar(&self, pred: impl Fn(u32) -> bool) -> MemoryBitVec {
let n = self.len();
let mut words = vec![0u64; n.div_ceil(64)];
for s in 0..n {
if pred(self.get(s)) { words[s >> 6] |= 1u64 << (s & 63); }
}
MemoryBitVec::from_words(words, n)
}
}
/// Mutable access; default methods use only `get` / `set` and maintain the
/// compact encoding invariants on the implementor's side.
pub trait IntSliceMut: IntSlice {
fn set(&mut self, slot: usize, value: u32);
fn inc(&mut self, slot: usize) -> &mut Self {
let v = self.get(slot);
self.set(slot, v.saturating_add(1));
self
}
fn dec(&mut self, slot: usize) -> &mut Self {
let v = self.get(slot);
self.set(slot, v.saturating_sub(1));
self
}
fn add_at(&mut self, slot: usize, delta: u32) -> &mut Self {
let v = self.get(slot);
self.set(slot, v.saturating_add(delta));
self
}
fn copy_from<S: IntSlice>(&mut self, src: &S) -> &mut Self {
assert_eq!(self.len(), src.len(), "IntSlice length mismatch");
for s in 0..src.len() { self.set(s, src.get(s)); }
self
}
fn min<S: IntSlice>(&mut self, other: &S) -> &mut Self {
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
for s in 0..other.len() { self.set(s, self.get(s).min(other.get(s))); }
self
}
fn max<S: IntSlice>(&mut self, other: &S) -> &mut Self {
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
for s in 0..other.len() { self.set(s, self.get(s).max(other.get(s))); }
self
}
fn add<S: IntSlice>(&mut self, other: &S) -> &mut Self {
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
for s in 0..other.len() { self.set(s, self.get(s).saturating_add(other.get(s))); }
self
}
fn diff<S: IntSlice>(&mut self, other: &S) -> &mut Self {
assert_eq!(self.len(), other.len(), "IntSlice length mismatch");
for s in 0..other.len() { self.set(s, self.get(s).saturating_sub(other.get(s))); }
self
}
/// For each slot where `bits` is true, increment `self` by 1.
/// Skips zero words entirely — O(n_ones) rather than O(n).
fn count_bits<B: BitSlice>(&mut self, bits: &B) -> &mut Self {
assert_eq!(self.len(), bits.len(), "IntSlice/BitSlice length mismatch");
for (w_idx, &word) in bits.words().iter().enumerate() {
if word == 0 { continue; }
let base = w_idx * 64;
let mut w = word;
while w != 0 {
let bit = w.trailing_zeros() as usize;
let slot = base + bit;
if slot < self.len() { self.inc(slot); }
w &= w - 1;
}
}
self
}
}
// ── IntSlice → MemoryBitVec conversions ───────────────────────────────────────
use crate::memoryvec::MemoryBitVec;
pub trait IntToBit: IntSlice {
/// Bit set iff value >= threshold. Consistent with `geq` and `build_from_counts`.
fn to_bitvec(&self, threshold: u32) -> MemoryBitVec { self.geq(threshold) }
/// Bit set iff value >= 1 (slot is present).
fn to_presence(&self) -> MemoryBitVec { self.geq(1) }
}
impl<T: IntSlice> IntToBit for T {}
// ── BitSlice → MemoryIntVec conversion ───────────────────────────────────────
use crate::memoryintvec::MemoryIntVec;
pub trait BitToInt: BitSlice {
fn to_intvec(&self) -> MemoryIntVec {
let n = self.len();
let mut primary = vec![0u8; n];
// Unpack u64 words: each byte within a word yields 8 output bytes.
// Values are always 0 or 1 → no overflow entries needed.
let words = self.words();
let full_words = n / 64;
for (w_idx, &word) in words[..full_words].iter().enumerate() {
let base = w_idx * 64;
for byte_off in 0..8usize {
let byte = (word >> (byte_off * 8)) as u8;
let out = &mut primary[base + byte_off * 8..base + byte_off * 8 + 8];
for bit in 0..8usize {
out[bit] = (byte >> bit) & 1;
}
}
}
// Remaining bits in the last partial word
let rem = n % 64;
if rem > 0 {
let word = words[full_words];
let base = full_words * 64;
for bit in 0..rem {
primary[base + bit] = ((word >> bit) & 1) as u8;
}
}
MemoryIntVec::from_primary(primary)
}
}
impl<T: BitSlice> BitToInt for T {}
// ── Column-level weight statistic — total count or presence count per column.
/// Additive across layers and partitions; used as denominator in normalised distances.
///
/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
+2
View File
@@ -98,7 +98,9 @@ impl KmerIndex {
fs::File::create(output.join(SENTINEL_INDEXED))?;
let idx = KmerIndex::open(output)?;
let t_pack = Stage::start("pack");
idx.pack_matrices()?;
rep.push(t_pack.stop());
Ok(idx)
}
}