feat: Add parallel column counts and partial distance metrics
Introduces parallel `count_ones` for `BitMatrix` and parallel column-sum aggregation alongside three pairwise distance constructors (Bray-Curtis, Euclidean, Hellinger) for `IntMatrix`. These methods support partial, layer-wise data by accepting precomputed global column sums for normalization, enabling additive decomposition across partitions. Includes unit tests verifying mathematical equivalence and partition additivity.
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
use std::{fs, io, path::{Path, PathBuf}};
|
||||
|
||||
use ndarray::{Array2};
|
||||
use ndarray::{Array1, Array2};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
|
||||
@@ -32,6 +32,15 @@ impl PersistentBitMatrix {
|
||||
self.cols.iter().map(|c| c.get(slot)).collect()
|
||||
}
|
||||
|
||||
/// Returns the number of set bits in each column as `Array1<u64>`.
|
||||
pub fn count_ones(&self) -> Array1<u64> {
|
||||
let counts: Vec<u64> = (0..self.n_cols())
|
||||
.into_par_iter()
|
||||
.map(|c| self.col(c).count_ones())
|
||||
.collect();
|
||||
Array1::from_vec(counts)
|
||||
}
|
||||
|
||||
// ── Distance matrices ─────────────────────────────────────────────────────
|
||||
|
||||
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
|
||||
@@ -63,6 +63,15 @@ impl PersistentCompactIntMatrix {
|
||||
self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold))
|
||||
}
|
||||
|
||||
/// Returns the sum of each column as `Array1<u64>`.
|
||||
pub fn sum(&self) -> Array1<u64> {
|
||||
let sums: Vec<u64> = (0..self.n_cols())
|
||||
.into_par_iter()
|
||||
.map(|c| self.col(c).sum())
|
||||
.collect();
|
||||
Array1::from_vec(sums)
|
||||
}
|
||||
|
||||
// ── Partial matrices (additively decomposable across layers) ──────────────
|
||||
|
||||
/// Returns `(sum_min[n×n], col_sums[n])`.
|
||||
@@ -117,6 +126,45 @@ impl PersistentCompactIntMatrix {
|
||||
(inter_m, union_m)
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot min(col_i[slot]/sum_i, col_j[slot]/sum_j)` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `1 - value`.
|
||||
pub fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_relfreq_bray_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot (col_i[slot]/sum_i - col_j[slot]/sum_j)²` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `sqrt(value)`.
|
||||
pub fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_relfreq_euclidean_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot (√(col_i/sum_i) - √(col_j/sum_j))²` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `sqrt(value) / √2`.
|
||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_hellinger_euclidean_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
// ── Private helpers ───────────────────────────────────────────────────────
|
||||
|
||||
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
|
||||
|
||||
@@ -84,6 +84,21 @@ fn zero_cols_roundtrip() {
|
||||
assert_eq!(m.n(), 8);
|
||||
}
|
||||
|
||||
// ── count_ones ────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn count_ones_per_column() {
|
||||
let (_d, m) = make_matrix(&[
|
||||
&[true, false, true, true], // 3 ones
|
||||
&[false, false, false, false], // 0 ones
|
||||
&[true, true, true, false], // 3 ones
|
||||
]);
|
||||
let c = m.count_ones();
|
||||
assert_eq!(c[0], 3);
|
||||
assert_eq!(c[1], 0);
|
||||
assert_eq!(c[2], 3);
|
||||
}
|
||||
|
||||
// ── Distance matrix tests ─────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -190,3 +190,81 @@ fn single_col_distance_matrix_is_zero() {
|
||||
assert_eq!(dm.shape(), &[1, 1]);
|
||||
assert_eq!(dm[[0, 0]], 0.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sum_returns_column_sums() {
|
||||
let (_d, m) = make_matrix(&[&[1, 2, 3], &[10, 20, 30], &[0, 0, 5]]);
|
||||
let s = m.sum();
|
||||
assert_eq!(s[0], 6);
|
||||
assert_eq!(s[1], 60);
|
||||
assert_eq!(s[2], 5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn partial_relfreq_bray_matches_full() {
|
||||
let (_d, m) = make_matrix(&[&[1, 0, 2], &[0, 1, 1], &[1, 1, 0]]);
|
||||
let col_sums = m.sum();
|
||||
let partial = m.partial_relfreq_bray_dist_matrix(&col_sums);
|
||||
let full = m.relfreq_bray_dist_matrix();
|
||||
let n = m.n_cols();
|
||||
// partial[i,j] = sum_min_relfreq; full[i,j] = 1 - sum_min_relfreq (off-diagonal only)
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i == j { continue; }
|
||||
assert!((partial[[i, j]] - (1.0 - full[[i, j]])).abs() < 1e-12, "[{i},{j}]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn partial_relfreq_euclidean_matches_full() {
|
||||
let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]);
|
||||
let col_sums = m.sum();
|
||||
let partial = m.partial_relfreq_euclidean_dist_matrix(&col_sums);
|
||||
let full = m.relfreq_euclidean_dist_matrix();
|
||||
let n = m.n_cols();
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
// partial = squared euclidean; full = sqrt(partial)
|
||||
assert!((partial[[i, j]].sqrt() - full[[i, j]]).abs() < 1e-12, "[{i},{j}]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn partial_hellinger_matches_full() {
|
||||
let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]);
|
||||
let col_sums = m.sum();
|
||||
let partial = m.partial_hellinger_euclidean_dist_matrix(&col_sums);
|
||||
let full = m.hellinger_dist_matrix();
|
||||
let n = m.n_cols();
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
// partial / sqrt(2) gives the Hellinger distance
|
||||
assert!((partial[[i, j]].sqrt() / std::f64::consts::SQRT_2 - full[[i, j]]).abs() < 1e-12, "[{i},{j}]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn partial_relfreq_bray_additive_across_split() {
|
||||
// Split rows [1,2,3,4,5] between two matrices; partial sums should add up.
|
||||
// col0=[1,2,3,4,5], col1=[5,4,3,2,1]
|
||||
// global sums: col0=15, col1=15
|
||||
// split: first=[1,2], second=[3,4,5]
|
||||
let (_d1, m1) = make_matrix(&[&[1, 2], &[5, 4]]);
|
||||
let (_d2, m2) = make_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
|
||||
let global_sums = ndarray::array![15u64, 15u64];
|
||||
let p1 = m1.partial_relfreq_bray_dist_matrix(&global_sums);
|
||||
let p2 = m2.partial_relfreq_bray_dist_matrix(&global_sums);
|
||||
let combined = &p1 + &p2;
|
||||
|
||||
let (_d, m_full) = make_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
|
||||
let full_partial = m_full.partial_relfreq_bray_dist_matrix(&global_sums);
|
||||
let n = m_full.n_cols();
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
assert!((combined[[i, j]] - full_partial[[i, j]]).abs() < 1e-12, "[{i},{j}]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user