Push mtzqmmrlmzzx #34
@@ -13,6 +13,26 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12;
|
||||
// Index entry: slot(u64) + pos(u64) = 16 bytes.
|
||||
pub const INDEX_ENTRY_SIZE: usize = 16;
|
||||
|
||||
/// Total of all values encoded in a compact-int primary byte slice.
///
/// Every byte of `primary` is first added at face value. A byte of 255 is a
/// sentinel standing in for a larger value, so for each item produced by
/// `overflow` one 255 placeholder is removed from the total and the item
/// itself is added instead.
///
/// Only the number of overflow items and their sum are used, so `overflow`
/// may yield its values in any order. Callers are expected to supply exactly
/// one item per sentinel byte; supplying more can make the intermediate
/// subtraction underflow.
#[inline]
pub(crate) fn byte_sum(primary: &[u8], overflow: impl Iterator<Item = u32>) -> u64 {
    // Face-value total of the primary bytes (sentinels counted as 255 for now).
    let mut byte_total = 0u64;
    for &byte in primary {
        byte_total += u64::from(byte);
    }

    // Walk the overflow values once, tracking how many there are and their sum.
    let mut sentinel_count = 0u64;
    let mut overflow_total = 0u64;
    for value in overflow {
        sentinel_count += 1;
        overflow_total += u64::from(value);
    }

    // Swap each 255 placeholder for the true value it represents.
    byte_total - 255 * sentinel_count + overflow_total
}
|
||||
|
||||
/// Number of non-zero entries in a compact-int primary byte slice.
///
/// The overflow sentinel byte (255) is itself non-zero, so testing each byte
/// against zero is enough and the overflow values never need to be consulted.
#[inline]
pub(crate) fn byte_count_nonzero(primary: &[u8]) -> u64 {
    // Each non-zero byte contributes 1, each zero byte contributes 0.
    primary.iter().map(|&byte| u64::from(byte != 0)).sum()
}
|
||||
|
||||
/// Parse a single overflow entry `(slot, value)` from a byte slice.
|
||||
#[inline]
|
||||
pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) {
|
||||
|
||||
@@ -11,7 +11,7 @@ use rayon::prelude::*;
|
||||
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::memoryintvec::MemoryIntVec;
|
||||
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
|
||||
@@ -230,16 +230,24 @@ impl PackedCompactIntMatrix {
|
||||
|
||||
pub(crate) fn sum(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum())
|
||||
self.columns.par_iter()
|
||||
.map(|ci| {
|
||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
||||
let overflow = (0..ci.n_overflow)
|
||||
.map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1);
|
||||
byte_sum(primary, overflow)
|
||||
})
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn count_nonzero(&self) -> Array1<u64> {
|
||||
Array1::from_vec(
|
||||
(0..self.n_cols).into_par_iter()
|
||||
.map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64)
|
||||
self.columns.par_iter()
|
||||
.map(|ci| {
|
||||
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
|
||||
byte_count_nonzero(primary)
|
||||
})
|
||||
.collect()
|
||||
)
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@ pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_ma
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
||||
pub use layer_meta::LayerMeta;
|
||||
pub use memoryintvec::MemoryIntVec;
|
||||
pub use memoryintvec::{MemoryIntIter, MemoryIntVec};
|
||||
pub use memoryvec::{MemoryBitIter, MemoryBitVec};
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
pub use traits::{BitPartials, BitSlice, BitSliceMut, BitToInt, ColumnWeights, CountPartials, IntSlice, IntSliceMut, IntToBit};
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::ops::{Add, AddAssign, Sub, SubAssign};
|
||||
use std::path::Path;
|
||||
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::format::{byte_count_nonzero, byte_sum};
|
||||
use crate::traits::{IntSlice, IntSliceMut};
|
||||
|
||||
// ── MemoryIntVec ──────────────────────────────────────────────────────────────
|
||||
@@ -37,8 +38,31 @@ impl MemoryIntVec {
|
||||
pub(crate) fn primary_bytes(&self) -> &[u8] { &self.primary }
|
||||
pub(crate) fn overflow_map(&self) -> &HashMap<usize, u32> { &self.overflow }
|
||||
|
||||
pub fn iter(&self) -> impl Iterator<Item = u32> + '_ {
|
||||
(0..self.n).map(move |slot| self.get(slot))
|
||||
pub fn get(&self, slot: usize) -> u32 {
|
||||
match self.primary[slot] {
|
||||
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn sum(&self) -> u64 {
|
||||
byte_sum(&self.primary, self.overflow.values().copied())
|
||||
}
|
||||
|
||||
pub fn count_nonzero(&self) -> u64 {
|
||||
byte_count_nonzero(&self.primary)
|
||||
}
|
||||
|
||||
pub fn filled(n: usize, value: u32) -> Self {
|
||||
if value < 255 {
|
||||
Self { primary: vec![value as u8; n], overflow: HashMap::new(), n }
|
||||
} else {
|
||||
Self { primary: vec![255u8; n], overflow: (0..n).map(|i| (i, value)).collect(), n }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter(&self) -> MemoryIntIter<'_> {
|
||||
MemoryIntIter { vec: self, slot: 0 }
|
||||
}
|
||||
|
||||
/// Write to disk and return a writable builder at `path`.
|
||||
@@ -51,13 +75,9 @@ impl MemoryIntVec {
|
||||
|
||||
impl IntSlice for MemoryIntVec {
|
||||
fn len(&self) -> usize { self.n }
|
||||
|
||||
fn get(&self, slot: usize) -> u32 {
|
||||
match self.primary[slot] {
|
||||
255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
|
||||
v => v as u32,
|
||||
}
|
||||
}
|
||||
fn get(&self, slot: usize) -> u32 { self.get(slot) }
|
||||
fn sum(&self) -> u64 { self.sum() }
|
||||
fn count_nonzero(&self) -> u64 { self.count_nonzero() }
|
||||
}
|
||||
|
||||
impl IntSliceMut for MemoryIntVec {
|
||||
@@ -118,3 +138,34 @@ impl<B: IntSlice> AddAssign<&B> for MemoryIntVec {
|
||||
impl<B: IntSlice> SubAssign<&B> for MemoryIntVec {
|
||||
fn sub_assign(&mut self, rhs: &B) { self.diff(rhs); }
|
||||
}
|
||||
|
||||
// ── Iterator ──────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct MemoryIntIter<'a> {
|
||||
vec: &'a MemoryIntVec,
|
||||
slot: usize,
|
||||
}
|
||||
|
||||
impl Iterator for MemoryIntIter<'_> {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<u32> {
|
||||
if self.slot >= self.vec.n { return None; }
|
||||
let v = self.vec.get(self.slot);
|
||||
self.slot += 1;
|
||||
Some(v)
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> (usize, Option<usize>) {
|
||||
let rem = self.vec.n - self.slot;
|
||||
(rem, Some(rem))
|
||||
}
|
||||
}
|
||||
|
||||
impl ExactSizeIterator for MemoryIntIter<'_> {}
|
||||
|
||||
impl<'a> IntoIterator for &'a MemoryIntVec {
|
||||
type Item = u32;
|
||||
type IntoIter = MemoryIntIter<'a>;
|
||||
fn into_iter(self) -> MemoryIntIter<'a> { self.iter() }
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::path::{Path, PathBuf};
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
use crate::format::{HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
|
||||
use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
|
||||
|
||||
pub struct PersistentCompactIntVec {
|
||||
mmap: Mmap,
|
||||
@@ -129,14 +129,14 @@ impl PersistentCompactIntVec {
|
||||
u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn sum(&self) -> u64 {
|
||||
self.iter().map(|v| v as u64).sum()
|
||||
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||
byte_sum(primary, (0..self.n_overflow).map(|i| self.data_value(i)))
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn count_nonzero(&self) -> u64 {
|
||||
self.iter().filter(|&v| v > 0).count() as u64
|
||||
let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
|
||||
byte_count_nonzero(primary)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -357,8 +357,6 @@ use crate::traits::IntSlice;
|
||||
impl IntSlice for PersistentCompactIntVec {
|
||||
fn len(&self) -> usize { self.n }
|
||||
fn get(&self, slot: usize) -> u32 { self.get(slot) }
|
||||
fn sum(&self) -> u64 { self.sum() }
|
||||
fn count_nonzero(&self) -> u64 { self.count_nonzero() }
|
||||
}
|
||||
|
||||
impl<'a> IntoIterator for &'a PersistentCompactIntVec {
|
||||
|
||||
@@ -68,8 +68,8 @@ pub trait IntSlice {
|
||||
fn get(&self, slot: usize) -> u32;
|
||||
fn is_empty(&self) -> bool { self.len() == 0 }
|
||||
fn iter(&self) -> impl Iterator<Item = u32> + '_ { (0..self.len()).map(|i| self.get(i)) }
|
||||
fn sum(&self) -> u64 { (0..self.len()).map(|s| self.get(s) as u64).sum() }
|
||||
fn count_nonzero(&self) -> u64 { (0..self.len()).filter(|&s| self.get(s) > 0).count() as u64 }
|
||||
fn sum(&self) -> u64 { self.iter().map(|v| v as u64).sum() }
|
||||
fn count_nonzero(&self) -> u64 { self.iter().filter(|v| *v > 0).count() as u64 }
|
||||
|
||||
fn lt(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v < threshold) }
|
||||
fn leq(&self, threshold: u32) -> MemoryBitVec { self.cmp_scalar(|v| v <= threshold) }
|
||||
|
||||
Reference in New Issue
Block a user