feat: Add parallel column counts and partial distance metrics

Introduces parallel `count_ones` for `BitMatrix` and parallel column-sum aggregation alongside three pairwise distance constructors (Bray-Curtis, Euclidean, Hellinger) for `IntMatrix`. These methods support partial, layer-wise data by accepting precomputed global column sums for normalization, enabling additive decomposition across partitions. Includes unit tests verifying mathematical equivalence and partition additivity.
This commit is contained in:
Eric Coissac
2026-05-15 20:41:51 +08:00
parent 8bee9f3017
commit 8409c852ef
4 changed files with 151 additions and 1 deletions
+10 -1
View File
@@ -1,6 +1,6 @@
use std::{fs, io, path::{Path, PathBuf}};
use ndarray::{Array2};
use ndarray::{Array1, Array2};
use rayon::prelude::*;
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
@@ -32,6 +32,15 @@ impl PersistentBitMatrix {
self.cols.iter().map(|c| c.get(slot)).collect()
}
/// Returns the number of set bits in each column as `Array1<u64>`.
pub fn count_ones(&self) -> Array1<u64> {
let counts: Vec<u64> = (0..self.n_cols())
.into_par_iter()
.map(|c| self.col(c).count_ones())
.collect();
Array1::from_vec(counts)
}
// ── Distance matrices ─────────────────────────────────────────────────────
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
+48
View File
@@ -63,6 +63,15 @@ impl PersistentCompactIntMatrix {
self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold))
}
/// Returns the sum of each column as `Array1<u64>`.
pub fn sum(&self) -> Array1<u64> {
let sums: Vec<u64> = (0..self.n_cols())
.into_par_iter()
.map(|c| self.col(c).sum())
.collect();
Array1::from_vec(sums)
}
// ── Partial matrices (additively decomposable across layers) ──────────────
/// Returns `(sum_min[n×n], col_sums[n])`.
@@ -117,6 +126,45 @@ impl PersistentCompactIntMatrix {
(inter_m, union_m)
}
/// Returns matrix of `Σ_slot min(col_i[slot]/sum_i, col_j[slot]/sum_j)` per pair.
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
/// Reduce across layers by element-wise addition; final distance = `1 - value`.
pub fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
self.pairwise(|i, j| {
self.col(i).partial_relfreq_bray_dist(
self.col(j),
col_sums[i] as f64,
col_sums[j] as f64,
)
})
}
/// Returns matrix of `Σ_slot (col_i[slot]/sum_i - col_j[slot]/sum_j)²` per pair.
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
/// Reduce across layers by element-wise addition; final distance = `sqrt(value)`.
pub fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
self.pairwise(|i, j| {
self.col(i).partial_relfreq_euclidean_dist(
self.col(j),
col_sums[i] as f64,
col_sums[j] as f64,
)
})
}
/// Returns matrix of `Σ_slot (√(col_i/sum_i) - √(col_j/sum_j))²` per pair.
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
/// Reduce across layers by element-wise addition; final distance = `sqrt(value) / √2`.
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
self.pairwise(|i, j| {
self.col(i).partial_hellinger_euclidean_dist(
self.col(j),
col_sums[i] as f64,
col_sums[j] as f64,
)
})
}
// ── Private helpers ───────────────────────────────────────────────────────
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
+15
View File
@@ -84,6 +84,21 @@ fn zero_cols_roundtrip() {
assert_eq!(m.n(), 8);
}
// ── count_ones ────────────────────────────────────────────────────────────────
#[test]
fn count_ones_per_column() {
let (_d, m) = make_matrix(&[
&[true, false, true, true], // 3 ones
&[false, false, false, false], // 0 ones
&[true, true, true, false], // 3 ones
]);
let c = m.count_ones();
assert_eq!(c[0], 3);
assert_eq!(c[1], 0);
assert_eq!(c[2], 3);
}
// ── Distance matrix tests ─────────────────────────────────────────────────────
#[test]
+78
View File
@@ -190,3 +190,81 @@ fn single_col_distance_matrix_is_zero() {
assert_eq!(dm.shape(), &[1, 1]);
assert_eq!(dm[[0, 0]], 0.0);
}
#[test]
fn sum_returns_column_sums() {
let (_d, m) = make_matrix(&[&[1, 2, 3], &[10, 20, 30], &[0, 0, 5]]);
let s = m.sum();
assert_eq!(s[0], 6);
assert_eq!(s[1], 60);
assert_eq!(s[2], 5);
}
#[test]
fn partial_relfreq_bray_matches_full() {
let (_d, m) = make_matrix(&[&[1, 0, 2], &[0, 1, 1], &[1, 1, 0]]);
let col_sums = m.sum();
let partial = m.partial_relfreq_bray_dist_matrix(&col_sums);
let full = m.relfreq_bray_dist_matrix();
let n = m.n_cols();
// partial[i,j] = sum_min_relfreq; full[i,j] = 1 - sum_min_relfreq (off-diagonal only)
for i in 0..n {
for j in 0..n {
if i == j { continue; }
assert!((partial[[i, j]] - (1.0 - full[[i, j]])).abs() < 1e-12, "[{i},{j}]");
}
}
}
#[test]
fn partial_relfreq_euclidean_matches_full() {
let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]);
let col_sums = m.sum();
let partial = m.partial_relfreq_euclidean_dist_matrix(&col_sums);
let full = m.relfreq_euclidean_dist_matrix();
let n = m.n_cols();
for i in 0..n {
for j in 0..n {
// partial = squared euclidean; full = sqrt(partial)
assert!((partial[[i, j]].sqrt() - full[[i, j]]).abs() < 1e-12, "[{i},{j}]");
}
}
}
#[test]
fn partial_hellinger_matches_full() {
let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]);
let col_sums = m.sum();
let partial = m.partial_hellinger_euclidean_dist_matrix(&col_sums);
let full = m.hellinger_dist_matrix();
let n = m.n_cols();
for i in 0..n {
for j in 0..n {
// partial / sqrt(2) gives the Hellinger distance
assert!((partial[[i, j]].sqrt() / std::f64::consts::SQRT_2 - full[[i, j]]).abs() < 1e-12, "[{i},{j}]");
}
}
}
#[test]
fn partial_relfreq_bray_additive_across_split() {
// Split rows [1,2,3,4,5] between two matrices; partial sums should add up.
// col0=[1,2,3,4,5], col1=[5,4,3,2,1]
// global sums: col0=15, col1=15
// split: first=[1,2], second=[3,4,5]
let (_d1, m1) = make_matrix(&[&[1, 2], &[5, 4]]);
let (_d2, m2) = make_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
let global_sums = ndarray::array![15u64, 15u64];
let p1 = m1.partial_relfreq_bray_dist_matrix(&global_sums);
let p2 = m2.partial_relfreq_bray_dist_matrix(&global_sums);
let combined = &p1 + &p2;
let (_d, m_full) = make_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
let full_partial = m_full.partial_relfreq_bray_dist_matrix(&global_sums);
let n = m_full.n_cols();
for i in 0..n {
for j in 0..n {
assert!((combined[[i, j]] - full_partial[[i, j]]).abs() < 1e-12, "[{i},{j}]");
}
}
}