Add column group operations and mask_with trait
Introduce the `ColGroup` struct and the `MatrixGroupOps` trait to manage named subsets of column indices and to perform additive aggregations over them (count, sum, any). Implement these operations for `PersistentBitMatrix` and `PersistentCompactIntMatrix`, using a size-dependent branch for presence counts: small groups accumulate directly into the primary bytes, while larger groups go through the overflow-aware path. Additionally, add a `mask_with` trait method that zeroes elements according to a mask and is optimized for sparse masks, with O(n_zeros) complexity. Include comprehensive tests covering overflow handling, slot masking, and additivity of results across partitioned data.
This commit is contained in:
@@ -10,11 +10,13 @@ use rayon::prelude::*;
|
||||
|
||||
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::colgroup::{ColGroup, MatrixGroupOps, inc_primary_bits};
|
||||
use crate::memoryintvec::MemoryIntVec;
|
||||
use crate::memoryvec::MemoryBitVec;
|
||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
use crate::traits::IntSlice;
|
||||
use crate::traits::{BitSliceMut, IntSlice, IntSliceMut};
|
||||
|
||||
fn col_path(dir: &Path, col: usize) -> PathBuf {
|
||||
dir.join(format!("col_{col:06}.pciv"))
|
||||
@@ -624,3 +626,49 @@ impl PersistentCompactIntMatrixBuilder {
|
||||
MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
|
||||
}
|
||||
}
|
||||
|
||||
// ── MatrixGroupOps ────────────────────────────────────────────────────────────
|
||||
|
||||
impl MatrixGroupOps for PersistentCompactIntMatrix {
|
||||
fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> MemoryIntVec {
|
||||
let n = self.n();
|
||||
if g.indices.len() < 255 {
|
||||
// Fast path: counts fit in u8 — accumulate directly into raw bytes,
|
||||
// no overflow map involved.
|
||||
let mut primary = vec![0u8; n];
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
inc_primary_bits(&mut primary, &mask);
|
||||
}
|
||||
MemoryIntVec::from_primary(primary)
|
||||
} else {
|
||||
// Slow path (rare): use IntSliceMut::count_bits which handles overflow.
|
||||
let mut result = MemoryIntVec::new(n);
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
result.count_bits(&mask);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
fn partial_group_sum(&self, g: &ColGroup) -> MemoryIntVec {
|
||||
let n = self.n();
|
||||
let mut result = MemoryIntVec::new(n);
|
||||
for &c in &g.indices {
|
||||
let view = self.col_view(c);
|
||||
IntSliceMut::add(&mut result, &view);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> MemoryBitVec {
|
||||
let n = self.n();
|
||||
let mut result = MemoryBitVec::new(n);
|
||||
for &c in &g.indices {
|
||||
let mask = self.col_view(c).cmp_scalar(|v| v >= threshold);
|
||||
result.or(&mask);
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user