perf: optimize aggregation with byte-level helpers and direct mmap

Introduce `byte_sum` and `byte_count_nonzero` to efficiently aggregate compact-int byte slices, bypassing per-element decoding and overflow map lookups. Refactor `sum()` and `count_nonzero()` across the matrix, reader, and traits modules to use direct memory-mapped slice iteration and idiomatic Rust iterators. Additionally, expose `MemoryIntIter` publicly and implement `IntoIterator` and `IntSlice` for `MemoryIntVec` to enable standard iteration and delegate aggregation to the new helpers.
This commit is contained in:
Eric Coissac
2026-06-17 00:13:16 +02:00
parent d1717688d2
commit df7b400fda
6 changed files with 101 additions and 24 deletions
+13 -5
View File
@@ -11,7 +11,7 @@ use rayon::prelude::*;
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
use crate::builder::PersistentCompactIntVecBuilder;
use crate::memoryintvec::MemoryIntVec;
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
@@ -230,16 +230,24 @@ impl PackedCompactIntMatrix {
/// Builds an `Array1<u64>` from a parallel per-column computation (one total per column).
///
/// NOTE(review): this listing looks like a rendered diff whose `+`/`-` markers were
/// stripped, so the body interleaves the replaced iterator chain with its replacement.
/// Confirm against the repository before editing; only one of the two chains is live.
pub(crate) fn sum(&self) -> Array1<u64> {
Array1::from_vec(
// Presumably the pre-change chain: for each column index, adds up `self.get(c, s)`
// (widened to u64) over every row index in `0..self.n_rows`.
(0..self.n_cols).into_par_iter()
.map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum())
// Presumably the post-change chain: one parallel task per entry of `self.columns`,
// working on the mapped data directly instead of going through `self.get`.
self.columns.par_iter()
.map(|ci| {
// Slice of `self.mmap` of length `n_rows`, starting at this column's `primary_start`.
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
// Lazy iterator over field `.1` of each of the column's `n_overflow` parsed
// overflow entries (presumably the stored value; verify in `format`).
let overflow = (0..ci.n_overflow)
.map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1);
// The helper receives both the primary slice and the overflow iterator.
byte_sum(primary, overflow)
})
.collect()
)
}
/// Builds an `Array1<u64>` from a parallel per-column computation (one count per column).
///
/// NOTE(review): this listing looks like a rendered diff whose `+`/`-` markers were
/// stripped, so the body interleaves the replaced iterator chain with its replacement.
/// Confirm against the repository before editing; only one of the two chains is live.
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
Array1::from_vec(
// Presumably the pre-change chain: for each column index, counts the row indices
// whose `self.get(c, s)` is greater than zero.
(0..self.n_cols).into_par_iter()
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
// Presumably the post-change chain: one parallel task per entry of `self.columns`.
self.columns.par_iter()
.map(|ci| {
// Slice of `self.mmap` of length `n_rows`, starting at this column's `primary_start`.
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
// NOTE(review): unlike `sum`, no overflow entries are read here — presumably an
// overflowed cell is still marked by a nonzero value in the primary slice; confirm.
byte_count_nonzero(primary)
})
.collect()
)
}