From 8409c852ef1cb686db08092a6aea6665243e9148 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 15 May 2026 20:41:51 +0800 Subject: [PATCH] feat: Add parallel column counts and partial distance metrics Introduces parallel `count_ones` for `BitMatrix` and parallel column-sum aggregation alongside three pairwise distance constructors (Bray-Curtis, Euclidean, Hellinger) for `IntMatrix`. These methods support partial, layer-wise data by accepting precomputed global column sums for normalization, enabling additive decomposition across partitions. Includes unit tests verifying mathematical equivalence and partition additivity. --- src/obicompactvec/src/bitmatrix.rs | 11 +++- src/obicompactvec/src/intmatrix.rs | 48 +++++++++++++++ src/obicompactvec/src/tests/bitmatrix.rs | 15 +++++ src/obicompactvec/src/tests/intmatrix.rs | 78 ++++++++++++++++++++++++ 4 files changed, 151 insertions(+), 1 deletion(-) diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index 7a10219..f6e76b2 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -1,6 +1,6 @@ use std::{fs, io, path::{Path, PathBuf}}; -use ndarray::{Array2}; +use ndarray::{Array1, Array2}; use rayon::prelude::*; use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; @@ -32,6 +32,15 @@ impl PersistentBitMatrix { self.cols.iter().map(|c| c.get(slot)).collect() } + /// Returns the number of set bits in each column as `Array1`. + pub fn count_ones(&self) -> Array1 { + let counts: Vec = (0..self.n_cols()) + .into_par_iter() + .map(|c| self.col(c).count_ones()) + .collect(); + Array1::from_vec(counts) + } + // ── Distance matrices ───────────────────────────────────────────────────── pub fn jaccard_dist_matrix(&self) -> Array2 { diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index 5cadd19..4f6742f 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -63,6 +63,15 @@ impl PersistentCompactIntMatrix { self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold)) } + /// Returns the sum of each column as `Array1`. + pub fn sum(&self) -> Array1 { + let sums: Vec = (0..self.n_cols()) + .into_par_iter() + .map(|c| self.col(c).sum()) + .collect(); + Array1::from_vec(sums) + } + // ── Partial matrices (additively decomposable across layers) ────────────── /// Returns `(sum_min[n×n], col_sums[n])`. @@ -117,6 +126,45 @@ impl PersistentCompactIntMatrix { (inter_m, union_m) } + /// Returns matrix of `Σ_slot min(col_i[slot]/sum_i, col_j[slot]/sum_j)` per pair. + /// `col_sums` must be the GLOBAL sums across all layers/partitions. + /// Reduce across layers by element-wise addition; final distance = `1 - value`. + pub fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { + self.pairwise(|i, j| { + self.col(i).partial_relfreq_bray_dist( + self.col(j), + col_sums[i] as f64, + col_sums[j] as f64, + ) + }) + } + + /// Returns matrix of `Σ_slot (col_i[slot]/sum_i - col_j[slot]/sum_j)²` per pair. + /// `col_sums` must be the GLOBAL sums across all layers/partitions. + /// Reduce across layers by element-wise addition; final distance = `sqrt(value)`. + pub fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { + self.pairwise(|i, j| { + self.col(i).partial_relfreq_euclidean_dist( + self.col(j), + col_sums[i] as f64, + col_sums[j] as f64, + ) + }) + } + + /// Returns matrix of `Σ_slot (√(col_i/sum_i) - √(col_j/sum_j))²` per pair. + /// `col_sums` must be the GLOBAL sums across all layers/partitions. + /// Reduce across layers by element-wise addition; final distance = `sqrt(value) / √2`. + pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { + self.pairwise(|i, j| { + self.col(i).partial_hellinger_euclidean_dist( + self.col(j), + col_sums[i] as f64, + col_sums[j] as f64, + ) + }) + } + // ── Private helpers ─────────────────────────────────────────────────────── fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2 { diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs index bfb6e34..f5845d6 100644 --- a/src/obicompactvec/src/tests/bitmatrix.rs +++ b/src/obicompactvec/src/tests/bitmatrix.rs @@ -84,6 +84,21 @@ fn zero_cols_roundtrip() { assert_eq!(m.n(), 8); } +// ── count_ones ──────────────────────────────────────────────────────────────── + +#[test] +fn count_ones_per_column() { + let (_d, m) = make_matrix(&[ + &[true, false, true, true], // 3 ones + &[false, false, false, false], // 0 ones + &[true, true, true, false], // 3 ones + ]); + let c = m.count_ones(); + assert_eq!(c[0], 3); + assert_eq!(c[1], 0); + assert_eq!(c[2], 3); +} + // ── Distance matrix tests ───────────────────────────────────────────────────── #[test] diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs index ced72c0..c195b42 100644 --- a/src/obicompactvec/src/tests/intmatrix.rs +++ b/src/obicompactvec/src/tests/intmatrix.rs @@ -190,3 +190,81 @@ fn single_col_distance_matrix_is_zero() { assert_eq!(dm.shape(), &[1, 1]); assert_eq!(dm[[0, 0]], 0.0); } + +#[test] +fn sum_returns_column_sums() { + let (_d, m) = make_matrix(&[&[1, 2, 3], &[10, 20, 30], &[0, 0, 5]]); + let s = m.sum(); + assert_eq!(s[0], 6); + assert_eq!(s[1], 60); + assert_eq!(s[2], 5); +} + +#[test] +fn partial_relfreq_bray_matches_full() { + let (_d, m) = make_matrix(&[&[1, 0, 2], &[0, 1, 1], &[1, 1, 0]]); + let col_sums = m.sum(); + let partial = m.partial_relfreq_bray_dist_matrix(&col_sums); + let full = m.relfreq_bray_dist_matrix(); + let n = m.n_cols(); + // partial[i,j] = sum_min_relfreq; full[i,j] = 1 - sum_min_relfreq (off-diagonal only) + for i in 0..n { + for j in 0..n { + if i == j { continue; } + assert!((partial[[i, j]] - (1.0 - full[[i, j]])).abs() < 1e-12, "[{i},{j}]"); + } + } +} + +#[test] +fn partial_relfreq_euclidean_matches_full() { + let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]); + let col_sums = m.sum(); + let partial = m.partial_relfreq_euclidean_dist_matrix(&col_sums); + let full = m.relfreq_euclidean_dist_matrix(); + let n = m.n_cols(); + for i in 0..n { + for j in 0..n { + // partial = squared euclidean; full = sqrt(partial) + assert!((partial[[i, j]].sqrt() - full[[i, j]]).abs() < 1e-12, "[{i},{j}]"); + } + } +} + +#[test] +fn partial_hellinger_matches_full() { + let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]); + let col_sums = m.sum(); + let partial = m.partial_hellinger_euclidean_dist_matrix(&col_sums); + let full = m.hellinger_dist_matrix(); + let n = m.n_cols(); + for i in 0..n { + for j in 0..n { + // partial / sqrt(2) gives the Hellinger distance + assert!((partial[[i, j]].sqrt() / std::f64::consts::SQRT_2 - full[[i, j]]).abs() < 1e-12, "[{i},{j}]"); + } + } +} + +#[test] +fn partial_relfreq_bray_additive_across_split() { + // Split rows [1,2,3,4,5] between two matrices; partial sums should add up. + // col0=[1,2,3,4,5], col1=[5,4,3,2,1] + // global sums: col0=15, col1=15 + // split: first=[1,2], second=[3,4,5] + let (_d1, m1) = make_matrix(&[&[1, 2], &[5, 4]]); + let (_d2, m2) = make_matrix(&[&[3, 4, 5], &[3, 2, 1]]); + let global_sums = ndarray::array![15u64, 15u64]; + let p1 = m1.partial_relfreq_bray_dist_matrix(&global_sums); + let p2 = m2.partial_relfreq_bray_dist_matrix(&global_sums); + let combined = &p1 + &p2; + + let (_d, m_full) = make_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]); + let full_partial = m_full.partial_relfreq_bray_dist_matrix(&global_sums); + let n = m_full.n_cols(); + for i in 0..n { + for j in 0..n { + assert!((combined[[i, j]] - full_partial[[i, j]]).abs() < 1e-12, "[{i},{j}]"); + } + } +}