Add column group operations and mask_with trait

Introduce the `ColGroup` struct and `MatrixGroupOps` trait to manage named subsets of column indices and perform additive aggregations (count, sum, any). Implement these operations for `PersistentBitMatrix` and `PersistentCompactIntMatrix`, applying size-optimized branches for presence counts and direct accumulation for small groups. Additionally, add a `mask_with` trait method that efficiently zeroes elements according to a mask and is optimized for sparse masks, with O(n_zeros) complexity. Include comprehensive tests covering overflow handling, slot masking, and result additivity across partitioned data.
This commit is contained in:
Eric Coissac
2026-06-17 14:50:28 +02:00
parent 93559c3294
commit 1d38d87ff9
7 changed files with 391 additions and 2 deletions
+49 -1
View File
@@ -10,11 +10,13 @@ use rayon::prelude::*;
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
use crate::builder::PersistentCompactIntVecBuilder;
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
use crate::memoryintvec::MemoryIntVec;
use crate::memoryvec::MemoryBitVec;
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
use crate::traits::IntSlice;
use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
fn col_path(dir: &Path, col: usize) -> PathBuf {
dir.join(format!("col_{col:06}.pciv"))
@@ -624,3 +626,49 @@ impl PersistentCompactIntMatrixBuilder {
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
}
}
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
/// Column-group aggregations for [`PersistentCompactIntMatrix`].
///
/// Each method walks the columns listed in `g.indices`, in order, and folds
/// them into a freshly created result sized with `self.n()`.
impl MatrixGroupOps for PersistentCompactIntMatrix {
    /// Accumulates, per position, the number of columns of `g` whose value is
    /// `>= threshold`.
    ///
    /// Groups of fewer than 255 columns are tallied directly into a raw byte
    /// buffer; larger groups go through `count_bits`, which handles overflow.
    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec {
        let len = self.n();
        // Lazily produces one `>= threshold` mask per column of the group;
        // nothing is computed until one of the branches below consumes it.
        let masks = g
            .indices
            .iter()
            .map(|&col| self.col_view(col).cmp_scalar(|v| v >= threshold));

        if g.indices.len() >= 255 {
            // Large group (rare): a per-position tally may no longer fit in a
            // single byte, so rely on the overflow-aware accumulator.
            let mut counts = MemoryIntVec::new(len);
            for mask in masks {
                counts.count_bits(&mask);
            }
            counts
        } else {
            // Small group: every tally fits in a u8, so increment raw bytes
            // and skip the overflow map entirely.
            // NOTE(review): the cutoff is 255 rather than 256 — presumably the
            // byte value 255 is reserved by the primary encoding; confirm.
            let mut bytes = vec![0u8; len];
            for mask in masks {
                inc_primary_bits(&mut bytes, &mask);
            }
            MemoryIntVec::from_primary(bytes)
        }
    }

    /// Adds every column of `g` into a single integer vector.
    fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
        let mut total = MemoryIntVec::new(self.n());
        g.indices.iter().for_each(|&col| {
            let column = self.col_view(col);
            // Fully qualified call, as in the rest of this file, to pin the
            // `IntSliceMut` implementation of `add`.
            IntSliceMut::add(&mut total, &column);
        });
        total
    }

    /// ORs together the `>= threshold` masks of every column of `g`.
    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec {
        let mut seen = MemoryBitVec::new(self.n());
        let hits = g
            .indices
            .iter()
            .map(|&col| self.col_view(col).cmp_scalar(|v| v >= threshold));
        for hit in hits {
            seen.or(&hit);
        }
        seen
    }
}