feat: introduce unified column view types for bit and int matrices

This commit introduces `BitColView` and `IntColView` to abstract over Columnar and Packed storage formats, implementing `BitSlice` and `IntSlice` for uniform column access. It adds `col_view()` accessors to `PersistentBitMatrix` and `PackedCompactIntMatrix`, explicitly panicking on implicit variants. The new types are publicly re-exported, and unit tests are added to validate per-element retrieval, aggregation methods, and parity with the original columnar representation.
This commit is contained in:
Eric Coissac
2026-06-17 14:48:31 +02:00
parent 1f0d77d5bf
commit 93559c3294
5 changed files with 186 additions and 5 deletions
+28
View File
@@ -183,6 +183,26 @@ impl BitSlice for PackedCol<'_> {
fn words(&self) -> &[u64] { self.words } fn words(&self) -> &[u64] { self.words }
} }
// ── BitColView — uniform column access across Columnar and Packed ─────────────
enum BitColViewInner<'a> {
Columnar(&'a PersistentBitVec),
Packed(PackedCol<'a>),
}
/// Opaque column view returned by [`PersistentBitMatrix::col_view`].
/// Implements [`BitSlice`] uniformly for both Columnar and Packed matrix formats.
pub struct BitColView<'a>(BitColViewInner<'a>);
impl BitSlice for BitColView<'_> {
fn len(&self) -> usize {
match &self.0 { BitColViewInner::Columnar(c) => c.len(), BitColViewInner::Packed(c) => c.len() }
}
fn words(&self) -> &[u64] {
match &self.0 { BitColViewInner::Columnar(c) => c.words(), BitColViewInner::Packed(c) => c.words() }
}
}
/// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files. /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> { pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
let packed_path = dir.join("matrix.pbmx"); let packed_path = dir.join("matrix.pbmx");
@@ -298,6 +318,14 @@ impl PersistentBitMatrix {
} }
} }
pub fn col_view(&self, c: usize) -> BitColView<'_> {
match self {
Self::Columnar(m) => BitColView(BitColViewInner::Columnar(m.col(c))),
Self::Packed(m) => BitColView(BitColViewInner::Packed(m.col_slice(c))),
Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
}
}
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> { pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
match self { match self {
Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path), Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
+44
View File
@@ -224,6 +224,43 @@ impl Iterator for PackedIntColIter<'_> {
impl ExactSizeIterator for PackedIntColIter<'_> {} impl ExactSizeIterator for PackedIntColIter<'_> {}
// ── IntColView — uniform column access across Columnar and Packed ─────────────
enum IntColViewInner<'a> {
Columnar(&'a PersistentCompactIntVec),
Packed(PackedIntCol<'a>),
}
/// Opaque column view returned by [`PersistentCompactIntMatrix::col_view`].
/// Implements [`IntSlice`] uniformly for both Columnar and Packed matrix formats.
pub struct IntColView<'a>(IntColViewInner<'a>);
impl IntSlice for IntColView<'_> {
fn len(&self) -> usize {
match &self.0 { IntColViewInner::Columnar(c) => c.len(), IntColViewInner::Packed(c) => c.len() }
}
fn get(&self, slot: usize) -> u32 {
match &self.0 { IntColViewInner::Columnar(c) => c.get(slot), IntColViewInner::Packed(c) => c.get(slot) }
}
fn primary_bytes(&self) -> &[u8] {
match &self.0 { IntColViewInner::Columnar(c) => c.primary_bytes(), IntColViewInner::Packed(c) => c.primary_bytes() }
}
fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
// Box<dyn Iterator> implements Iterator, satisfying RPITIT across two distinct types.
let it: Box<dyn Iterator<Item = (usize, u32)> + '_> = match &self.0 {
IntColViewInner::Columnar(c) => Box::new(c.overflow_entries()),
IntColViewInner::Packed(c) => Box::new(c.overflow_entries()),
};
it
}
fn sum(&self) -> u64 {
match &self.0 { IntColViewInner::Columnar(c) => c.sum(), IntColViewInner::Packed(c) => c.sum() }
}
fn count_nonzero(&self) -> u64 {
match &self.0 { IntColViewInner::Columnar(c) => c.count_nonzero(), IntColViewInner::Packed(c) => c.count_nonzero() }
}
}
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
pub struct PackedCompactIntMatrix { pub struct PackedCompactIntMatrix {
@@ -481,6 +518,13 @@ impl PersistentCompactIntMatrix {
} }
} }
pub fn col_view(&self, c: usize) -> IntColView<'_> {
match self {
Self::Columnar(m) => IntColView(IntColViewInner::Columnar(m.col(c))),
Self::Packed(m) => IntColView(IntColViewInner::Packed(m.col_slice(c))),
}
}
pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> { pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
match self { match self {
Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path), Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path),
+2 -2
View File
@@ -11,9 +11,9 @@ mod reader;
pub mod traits; pub mod traits;
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder}; pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix}; pub use bitmatrix::{BitColView, PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
pub use builder::PersistentCompactIntVecBuilder; pub use builder::PersistentCompactIntVecBuilder;
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use intmatrix::{IntColView, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
pub use layer_meta::LayerMeta; pub use layer_meta::LayerMeta;
pub use memoryintvec::{MemoryIntIter, MemoryIntVec}; pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
pub use memoryvec::MemoryBitVec; pub use memoryvec::MemoryBitVec;
+55 -1
View File
@@ -1,6 +1,6 @@
use tempfile::tempdir; use tempfile::tempdir;
use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder}; use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
use crate::traits::{BitPartials, BitSlice, BitSliceMut}; use crate::traits::{BitPartials, BitSlice, BitSliceMut};
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
@@ -203,3 +203,57 @@ fn partial_hamming_matches_hamming() {
let full = m.hamming_dist_matrix(); let full = m.hamming_dist_matrix();
assert_eq!(partial, full); assert_eq!(partial, full);
} }
// ── col_view on Packed ────────────────────────────────────────────────────────
#[test]
fn col_view_packed_values() {
let (dir, _) = make_matrix(&[
&[true, false, true, true],
&[false, true, false, true],
]);
pack_bit_matrix(&dir.path().join("presence")).unwrap();
let m = PersistentBitMatrix::open(dir.path()).unwrap();
// col 0: [T, F, T, T]
let v0 = m.col_view(0);
assert_eq!(v0.len(), 4);
assert_eq!(v0.get(0), true);
assert_eq!(v0.get(1), false);
assert_eq!(v0.get(2), true);
assert_eq!(v0.get(3), true);
assert_eq!(v0.count_ones(), 3);
// col 1: [F, T, F, T]
let v1 = m.col_view(1);
assert_eq!(v1.get(0), false);
assert_eq!(v1.get(1), true);
assert_eq!(v1.get(2), false);
assert_eq!(v1.get(3), true);
assert_eq!(v1.count_ones(), 2);
}
#[test]
fn col_view_packed_matches_columnar() {
let data: &[&[bool]] = &[
&[true, false, true, false, true, true, false, true],
&[false, false, true, true, false, true, true, false],
&[true, true, true, false, false, false, true, true],
];
let (dir_col, m_col) = make_matrix(data);
let (dir_pack, _) = make_matrix(data);
pack_bit_matrix(&dir_pack.path().join("presence")).unwrap();
let m_pack = PersistentBitMatrix::open(dir_pack.path()).unwrap();
for c in 0..data.len() {
let col_ref = m_col.col(c);
let col_view = m_pack.col_view(c);
assert_eq!(col_view.len(), col_ref.len(), "col={c} len");
for s in 0..col_ref.len() {
assert_eq!(col_view.get(s), col_ref.get(s), "col={c} slot={s}");
}
assert_eq!(col_view.count_ones(), col_ref.count_ones(), "col={c} count_ones");
assert_eq!(col_view.words(), col_ref.words(), "col={c} words");
}
drop(dir_col);
}
+57 -2
View File
@@ -1,7 +1,7 @@
use tempfile::tempdir; use tempfile::tempdir;
use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
use crate::traits::CountPartials; use crate::traits::{CountPartials, IntSlice};
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
let n = cols.first().map_or(0, |c| c.len()); let n = cols.first().map_or(0, |c| c.len());
@@ -243,6 +243,61 @@ fn partial_hellinger_matches_full() {
} }
} }
#[test]
fn col_view_packed_values() {
// Build Columnar with overflow values (≥ 255), pack, reopen as Packed, exercise col_view().
let (dir, _col) = make_matrix(&[&[10, 300, 500], &[200, 50, 1000]]);
pack_compact_int_matrix(&dir.path().join("counts")).unwrap();
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
// col 0: [10, 300, 500] — two overflow slots
let v0 = m.col_view(0);
assert_eq!(v0.get(0), 10);
assert_eq!(v0.get(1), 300);
assert_eq!(v0.get(2), 500);
assert_eq!(v0.sum(), 810);
assert_eq!(v0.count_nonzero(), 3);
let mut ov0: Vec<(usize, u32)> = v0.overflow_entries().collect();
ov0.sort_unstable_by_key(|&(s, _)| s);
assert_eq!(ov0, vec![(1, 300), (2, 500)]);
// col 1: [200, 50, 1000] — one overflow slot
let v1 = m.col_view(1);
assert_eq!(v1.get(0), 200);
assert_eq!(v1.get(1), 50);
assert_eq!(v1.get(2), 1000);
let mut ov1: Vec<(usize, u32)> = v1.overflow_entries().collect();
ov1.sort_unstable_by_key(|&(s, _)| s);
assert_eq!(ov1, vec![(2, 1000)]);
}
#[test]
fn col_view_packed_matches_columnar() {
// Same data, compare col_view() on Packed against col() on Columnar slot-by-slot.
let data: &[&[u32]] = &[&[0, 255, 1, 300, 128], &[500, 3, 0, 700, 42]];
let (dir_col, m_col) = make_matrix(data);
// Re-build in a separate dir so we can pack without touching m_col's files.
let (dir_pack, _) = make_matrix(data);
pack_compact_int_matrix(&dir_pack.path().join("counts")).unwrap();
let m_pack = PersistentCompactIntMatrix::open(dir_pack.path()).unwrap();
for c in 0..data.len() {
let col_ref = m_col.col(c);
let col_view = m_pack.col_view(c);
assert_eq!(col_view.len(), col_ref.len());
for s in 0..col_ref.len() {
assert_eq!(col_view.get(s), col_ref.get(s), "col={c} slot={s}");
}
assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum");
let mut ov_view: Vec<(usize, u32)> = col_view.overflow_entries().collect();
let mut ov_ref: Vec<(usize, u32)> = col_ref.overflow_entries().collect();
ov_view.sort_unstable_by_key(|&(s, _)| s);
ov_ref.sort_unstable_by_key(|&(s, _)| s);
assert_eq!(ov_view, ov_ref, "col={c} overflow_entries");
}
drop(dir_col);
}
#[test] #[test]
fn partial_relfreq_bray_additive_across_split() { fn partial_relfreq_bray_additive_across_split() {
// Split rows [1,2,3,4,5] between two matrices; partial sums should add up. // Split rows [1,2,3,4,5] between two matrices; partial sums should add up.