From aa98e82875c3cc07ddf13d4c82180bbace34c2ec Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Wed, 17 Jun 2026 09:28:16 +0200 Subject: [PATCH] refactor: introduce PackedIntCol view and use iterators Centralizes overflow handling and improves modularity by replacing manual mmap indexing and row loops with composable iterator patterns. This change leverages Rust's iterator traits for efficient, idiomatic column traversal while encapsulating data access in a dedicated view struct. --- src/obicompactvec/src/intmatrix.rs | 229 ++++++++++++++++++----------- 1 file changed, 146 insertions(+), 83 deletions(-) diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index 69240dd..0be16fb 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -14,6 +14,7 @@ use crate::memoryintvec::MemoryIntVec; use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry}; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; +use crate::traits::IntSlice; fn col_path(dir: &Path, col: usize) -> PathBuf { dir.join(format!("col_{col:06}.pciv")) @@ -124,6 +125,107 @@ struct ColInfo { index: Vec<(usize, usize)>, } +// ── PackedIntCol — lightweight column view backed by the shared mmap ────────── + +pub(crate) struct PackedIntCol<'a> { + primary: &'a [u8], + overflow: &'a [u8], // raw bytes: n_overflow × OVERFLOW_ENTRY_SIZE + n_overflow: usize, + step: usize, + index: &'a [(usize, usize)], + n: usize, +} + +impl PackedIntCol<'_> { + fn overflow_get(&self, slot: usize) -> u32 { + let (pos_start, pos_end) = if self.step == 0 { + (0, self.n_overflow) + } else { + let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); + let start = self.index[i].1; + let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow }; + (start, end) + }; + let mut lo = pos_start; + let mut hi = pos_end; + while lo < hi { + let mid = lo + (hi - lo) / 2; + let (stored, val) = parse_overflow_entry(self.overflow, 0, mid); + match stored.cmp(&slot) { + Ordering::Equal => return val, + Ordering::Less => lo = mid + 1, + Ordering::Greater => hi = mid, + } + } + panic!("slot {slot} marked overflow but not found") + } +} + +impl IntSlice for PackedIntCol<'_> { + fn len(&self) -> usize { self.n } + + fn get(&self, slot: usize) -> u32 { + let v = self.primary[slot]; + if v < 255 { v as u32 } else { self.overflow_get(slot) } + } + + fn primary_bytes(&self) -> &[u8] { self.primary } + + fn overflow_entries(&self) -> impl Iterator + '_ { + (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i)) + } + + fn iter(&self) -> impl Iterator + '_ { + PackedIntColIter { + primary: self.primary, + overflow: self.overflow, + slot: 0, + overflow_pos: 0, + n: self.n, + } + } + + fn sum(&self) -> u64 { + byte_sum(self.primary, (0..self.n_overflow).map(|i| parse_overflow_entry(self.overflow, 0, i).1)) + } + + fn count_nonzero(&self) -> u64 { byte_count_nonzero(self.primary) } +} + +struct PackedIntColIter<'a> { + primary: &'a [u8], + overflow: &'a [u8], + slot: usize, + overflow_pos: usize, + n: usize, +} + +impl Iterator for PackedIntColIter<'_> { + type Item = u32; + + fn next(&mut self) -> Option { + if self.slot >= self.n { return None; } + let v = self.primary[self.slot]; + self.slot += 1; + if v < 255 { + Some(v as u32) + } else { + let (_, val) = parse_overflow_entry(self.overflow, 0, self.overflow_pos); + self.overflow_pos += 1; + Some(val) + } + } + + fn size_hint(&self) -> (usize, Option) { + let rem = self.n - self.slot; + (rem, Some(rem)) + } +} + +impl ExactSizeIterator for PackedIntColIter<'_> {} + +// ───────────────────────────────────────────────────────────────────────────── + pub struct PackedCompactIntMatrix { mmap: Mmap, n_rows: usize, @@ -148,10 +250,10 @@ impl PackedCompactIntMatrix { let off_pos = PCMX_HEADER + c * 8; let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize; // Parse embedded PCIV header at col_base - let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize; - let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize; - let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize; - let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize; + let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize; + let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize; + let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize; + let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize; let primary_start = col_base + HEADER_SIZE; let data_offset = primary_start + n_pciv; @@ -167,57 +269,31 @@ impl PackedCompactIntMatrix { Ok(Self { mmap, n_rows, n_cols, columns }) } - fn col_overflow_map(&self, ci: &ColInfo) -> HashMap { - let mut overflow = HashMap::with_capacity(ci.n_overflow); - for i in 0..ci.n_overflow { - let (slot, value) = parse_overflow_entry(&self.mmap, ci.data_offset, i); - overflow.insert(slot, value); + pub(crate) fn col_slice(&self, c: usize) -> PackedIntCol<'_> { + let ci = &self.columns[c]; + PackedIntCol { + primary: &self.mmap[ci.primary_start..ci.primary_start + self.n_rows], + overflow: &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE], + n_overflow: ci.n_overflow, + step: ci.step, + index: &ci.index, + n: self.n_rows, } - overflow } pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { - let ci = &self.columns[c]; - let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; - PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path) + let col = self.col_slice(c); + let overflow: HashMap = col.overflow_entries().collect(); + PersistentCompactIntVecBuilder::from_raw_primary(col.primary, overflow, path) } pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec { - let ci = &self.columns[c]; - let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec(); - MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci)) + MemoryIntVec::from(&self.col_slice(c)) } #[inline] pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { - let ci = &self.columns[col]; - let v = self.mmap[ci.primary_start + slot]; - if v < 255 { return v as u32; } - self.overflow_get(ci, slot) - } - - fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 { - let (pos_start, pos_end) = if ci.step == 0 { - (0, ci.n_overflow) - } else { - let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); - let start = ci.index[i].1; - let end = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow }; - (start, end) - }; - let mut lo = pos_start; - let mut hi = pos_end; - while lo < hi { - let mid = lo + (hi - lo) / 2; - let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE; - let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize; - match stored.cmp(&slot) { - Ordering::Equal => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()), - Ordering::Less => lo = mid + 1, - Ordering::Greater => hi = mid, - } - } - panic!("slot {slot} marked overflow but not found") + self.col_slice(col).get(slot) } pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) { @@ -230,73 +306,62 @@ impl PackedCompactIntMatrix { pub(crate) fn sum(&self) -> Array1 { Array1::from_vec( - self.columns.par_iter() - .map(|ci| { - let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; - let overflow = (0..ci.n_overflow) - .map(|i| parse_overflow_entry(&self.mmap, ci.data_offset, i).1); - byte_sum(primary, overflow) - }) + (0..self.n_cols).into_par_iter() + .map(|c| self.col_slice(c).sum()) .collect() ) } pub(crate) fn count_nonzero(&self) -> Array1 { Array1::from_vec( - self.columns.par_iter() - .map(|ci| { - let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; - byte_count_nonzero(primary) - }) + (0..self.n_cols).into_par_iter() + .map(|c| self.col_slice(c).count_nonzero()) .collect() ) } - // ── Pair primitives ─────────────────────────────────────────────────────── + // ── Pair primitives — sequential scan via col_slice().iter() ───────────── fn pair_partial_bray(&self, i: usize, j: usize) -> u64 { - (0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum() + self.col_slice(i).iter().zip(self.col_slice(j).iter()) + .map(|(a, b)| a.min(b) as u64) + .sum() } fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 { - (0..self.n_rows).map(|s| { - let d = self.get(i, s) as f64 - self.get(j, s) as f64; - d * d - }).sum() + self.col_slice(i).iter().zip(self.col_slice(j).iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) + .sum() } fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) { - let (mut inter, mut union) = (0u64, 0u64); - for s in 0..self.n_rows { - let a = self.get(i, s) >= t; - let b = self.get(j, s) >= t; - if a && b { inter += 1; } - if a || b { union += 1; } - } - (inter, union) + self.col_slice(i).iter().zip(self.col_slice(j).iter()) + .fold((0u64, 0u64), |(inter, uni), (a, b)| { + let ap = a >= t; + let bp = b >= t; + (inter + (ap & bp) as u64, uni + (ap | bp) as u64) + }) } fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - (0..self.n_rows).map(|s| { - (self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj) - }).sum() + self.col_slice(i).iter().zip(self.col_slice(j).iter()) + .map(|(a, b)| (a as f64 / si).min(b as f64 / sj)) + .sum() } fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - (0..self.n_rows).map(|s| { - let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj; - d * d - }).sum() + self.col_slice(i).iter().zip(self.col_slice(j).iter()) + .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }) + .sum() } fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - (0..self.n_rows).map(|s| { - let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt(); - d * d - }).sum() + self.col_slice(i).iter().zip(self.col_slice(j).iter()) + .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }) + .sum() } // ── Matrix methods ──────────────────────────────────────────────────────── @@ -324,7 +389,6 @@ impl PackedCompactIntMatrix { pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - } /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files. @@ -516,4 +580,3 @@ impl PersistentCompactIntMatrixBuilder { MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir) } } -