From df7b400fdabfcb79e69ffd74e14b756edbd9ac84 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 17 Jun 2026 00:13:16 +0200 Subject: [PATCH] perf: optimize aggregation with byte-level helpers and direct mmap Introduce `byte_sum` and `byte_count_nonzero` to efficiently aggregate compact-int byte slices, bypassing per-element decoding and overflow map lookups. Refactor `sum()` and `count_nonzero()` across the matrix, reader, and traits modules to use direct memory-mapped slice iteration and idiomatic Rust iterators. Additionally, expose `MemoryIntIter` publicly and implement `IntoIterator` and `IntSlice` for `MemoryIntVec` to enable standard iteration and delegate aggregation to the new helpers. --- src/obicompactvec/src/format.rs | 20 ++++++++ src/obicompactvec/src/intmatrix.rs | 18 +++++-- src/obicompactvec/src/lib.rs | 2 +- src/obicompactvec/src/memoryintvec.rs | 69 +++++++++++++++++++++++---- src/obicompactvec/src/reader.rs | 12 ++--- src/obicompactvec/src/traits.rs | 4 +- 6 files changed, 101 insertions(+), 24 deletions(-) diff --git a/src/obicompactvec/src/format.rs b/src/obicompactvec/src/format.rs index 265167d..b3c24d0 100644 --- a/src/obicompactvec/src/format.rs +++ b/src/obicompactvec/src/format.rs @@ -13,6 +13,26 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12; // Index entry: slot(u64) + pos(u64) = 16 bytes. pub const INDEX_ENTRY_SIZE: usize = 16; +/// Sum all values in a compact-int primary byte slice, correcting for overflow sentinels. +/// +/// `primary` is the raw `&[u8]` where 255 is a sentinel for large values. +/// `overflow` yields the true values (≥ 255) for each sentinel, in any order. +#[inline] +pub(crate) fn byte_sum(primary: &[u8], overflow: impl Iterator) -> u64 { + let raw: u64 = primary.iter().map(|&b| b as u64).sum(); + let (n, ov) = overflow.fold((0u64, 0u64), |(n, s), v| (n + 1, s + v as u64)); + raw - 255 * n + ov +} + +/// Count non-zero values in a compact-int primary byte slice. 
+/// +/// Overflow sentinels (255) are always non-zero by construction, so a single +/// `b != 0` test is sufficient — no overflow map lookup needed. +#[inline] +pub(crate) fn byte_count_nonzero(primary: &[u8]) -> u64 { + primary.iter().filter(|&&b| b != 0).count() as u64 +} + /// Parse a single overflow entry `(slot, value)` from a byte slice. #[inline] pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) { diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index 9d97f8e..69240dd 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -11,7 +11,7 @@ use rayon::prelude::*; use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix}; use crate::builder::PersistentCompactIntVecBuilder; use crate::memoryintvec::MemoryIntVec; -use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry}; +use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry}; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; @@ -230,16 +230,24 @@ impl PackedCompactIntMatrix { pub(crate) fn sum(&self) -> Array1 { Array1::from_vec( - (0..self.n_cols).into_par_iter() - .map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum()) + self.columns.par_iter() + .map(|ci| { + let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; + let overflow = (0..ci.n_overflow) + .map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1); + byte_sum(primary, overflow) + }) .collect() ) } pub(crate) fn count_nonzero(&self) -> Array1 { Array1::from_vec( - (0..self.n_cols).into_par_iter() - .map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64) + self.columns.par_iter() + .map(|ci| { + let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; + byte_count_nonzero(primary) + }) .collect() ) } diff --git a/src/obicompactvec/src/lib.rs 
b/src/obicompactvec/src/lib.rs index fb2d5e2..ced509b 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -15,7 +15,7 @@ pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_ma pub use builder::PersistentCompactIntVecBuilder; pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use layer_meta::LayerMeta; -pub use memoryintvec::MemoryIntVec; +pub use memoryintvec::{MemoryIntIter, MemoryIntVec}; pub use memoryvec::{MemoryBitIter, MemoryBitVec}; pub use reader::PersistentCompactIntVec; pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit}; diff --git a/src/obicompactvec/src/memoryintvec.rs b/src/obicompactvec/src/memoryintvec.rs index 486a0f1..735431c 100644 --- a/src/obicompactvec/src/memoryintvec.rs +++ b/src/obicompactvec/src/memoryintvec.rs @@ -4,6 +4,7 @@ use std::ops::{Add, AddAssign, Sub, SubAssign}; use std::path::Path; use crate::builder::PersistentCompactIntVecBuilder; +use crate::format::{byte_count_nonzero, byte_sum}; use crate::traits::{IntSlice, IntSliceMut}; // ── MemoryIntVec ────────────────────────────────────────────────────────────── @@ -37,8 +38,31 @@ impl MemoryIntVec { pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary } pub(crate) fn overflow_map(&self) -> &HashMap { &self.overflow } - pub fn iter(&self) -> impl Iterator + '_ { - (0..self.n).map(move |slot| self.get(slot)) + pub fn get(&self, slot: usize) -> u32 { + match self.primary[slot] { + 255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"), + v => v as u32, + } + } + + pub fn sum(&self) -> u64 { + byte_sum(&self.primary, self.overflow.values().copied()) + } + + pub fn count_nonzero(&self) -> u64 { + byte_count_nonzero(&self.primary) + } + + pub fn filled(n: usize, value: u32) -> Self { + if value < 255 { + Self { primary: vec![value as u8; n], overflow: HashMap::new(), n } + 
} else { + Self { primary: vec![255u8; n], overflow: (0..n).map(|i| (i, value)).collect(), n } + } + } + + pub fn iter(&self) -> MemoryIntIter<'_> { + MemoryIntIter { vec: self, slot: 0 } } /// Write to disk and return a writable builder at `path`. @@ -51,13 +75,9 @@ impl MemoryIntVec { impl IntSlice for MemoryIntVec { fn len(&self) -> usize { self.n } - - fn get(&self, slot: usize) -> u32 { - match self.primary[slot] { - 255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"), - v => v as u32, - } - } + fn get(&self, slot: usize) -> u32 { self.get(slot) } + fn sum(&self) -> u64 { self.sum() } + fn count_nonzero(&self) -> u64 { self.count_nonzero() } } impl IntSliceMut for MemoryIntVec { @@ -118,3 +138,34 @@ impl AddAssign<&B> for MemoryIntVec { impl SubAssign<&B> for MemoryIntVec { fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); } } + +// ── Iterator ────────────────────────────────────────────────────────────────── + +pub struct MemoryIntIter<'a> { + vec: &'a MemoryIntVec, + slot: usize, +} + +impl Iterator for MemoryIntIter<'_> { + type Item = u32; + + fn next(&mut self) -> Option { + if self.slot >= self.vec.n { return None; } + let v = self.vec.get(self.slot); + self.slot += 1; + Some(v) + } + + fn size_hint(&self) -> (usize, Option) { + let rem = self.vec.n - self.slot; + (rem, Some(rem)) + } +} + +impl ExactSizeIterator for MemoryIntIter<'_> {} + +impl<'a> IntoIterator for &'a MemoryIntVec { + type Item = u32; + type IntoIter = MemoryIntIter<'a>; + fn into_iter(self) -> MemoryIntIter<'a> { self.iter() } +} diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index bd3d7d7..4d5b9e0 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -4,7 +4,7 @@ use std::path::{Path, PathBuf}; use memmap2::Mmap; -use crate::format::{HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry}; +use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, 
parse_index_entry}; pub struct PersistentCompactIntVec { mmap: Mmap, @@ -129,14 +129,14 @@ impl PersistentCompactIntVec { u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) } - #[inline] pub fn sum(&self) -> u64 { - self.iter().map(|v| v as u64).sum() + let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n]; + byte_sum(primary, (0..self.n_overflow).map(|i| self.data_value(i))) } - #[inline] pub fn count_nonzero(&self) -> u64 { - self.iter().filter(|&v| v > 0).count() as u64 + let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n]; + byte_count_nonzero(primary) } #[inline] diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index 32e40a1..ff9df71 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -68,8 +68,8 @@ pub trait IntSlice { fn get(&self, slot: usize) -> u32; fn is_empty(&self) -> bool { self.len() == 0 } fn iter(&self) -> impl Iterator + '_ { (0..self.len()).map(|i| self.get(i)) } - fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() } - fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 } + fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() } + fn count_nonzero(&self) -> u64 { self.iter().filter(|v| *v > 0).count() as u64 } fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v < threshold) } fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) }