feat: Add parallel column counts and partial distance metrics

Introduces parallel `count_ones` for `BitMatrix` and parallel column-sum aggregation alongside three pairwise distance constructors (Bray-Curtis, Euclidean, Hellinger) for `IntMatrix`. These methods support partial, layer-wise data by accepting precomputed global column sums for normalization, enabling additive decomposition across partitions. Includes unit tests verifying mathematical equivalence and partition additivity.
This commit is contained in:
Eric Coissac
2026-05-15 20:41:51 +08:00
parent 8bee9f3017
commit 8409c852ef
4 changed files with 151 additions and 1 deletions
+48
View File
@@ -63,6 +63,15 @@ impl PersistentCompactIntMatrix {
self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold))
}
/// Computes the total of every column and returns them as an `Array1<u64>`.
///
/// Entry `c` of the result is `self.col(c).sum()`. Columns are processed
/// through a parallel iterator over `0..self.n_cols()` and gathered into a
/// `Vec<u64>` before being wrapped in the array.
pub fn sum(&self) -> Array1<u64> {
    // Per-column reduction, evaluated independently for each column index.
    let column_total = |idx: usize| -> u64 { self.col(idx).sum() };
    let n_cols = self.n_cols();
    Array1::from_vec(
        (0..n_cols)
            .into_par_iter()
            .map(column_total)
            .collect::<Vec<u64>>(),
    )
}
// ── Partial matrices (additively decomposable across layers) ──────────────
/// Returns `(sum_min[n×n], col_sums[n])`.
@@ -117,6 +126,45 @@ impl PersistentCompactIntMatrix {
(inter_m, union_m)
}
/// Builds this layer's partial matrix for the relative-frequency Bray-Curtis distance.
///
/// For every column pair `(i, j)` the entry is
/// `Σ_slot min(col_i[slot]/sum_i, col_j[slot]/sum_j)`, delegated to the
/// column-level `partial_relfreq_bray_dist`.
///
/// `col_sums` must contain the GLOBAL column sums (across all layers/partitions),
/// not the sums of this layer alone; each sum is cast to `f64` before use.
/// Partial matrices from several layers are reduced by element-wise addition,
/// and the final distance is `1 - value`.
///
/// # Panics
/// Indexing `col_sums` panics if it has no entry for a column index handed
/// out by `pairwise`.
pub fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
    let cell = |i: usize, j: usize| -> f64 {
        let left = self.col(i);
        let right = self.col(j);
        let total_i = col_sums[i] as f64;
        let total_j = col_sums[j] as f64;
        left.partial_relfreq_bray_dist(right, total_i, total_j)
    };
    self.pairwise(cell)
}
/// Builds this layer's partial matrix for the relative-frequency Euclidean distance.
///
/// For every column pair `(i, j)` the entry is
/// `Σ_slot (col_i[slot]/sum_i - col_j[slot]/sum_j)²`, delegated to the
/// column-level `partial_relfreq_euclidean_dist`.
///
/// `col_sums` must contain the GLOBAL column sums (across all layers/partitions),
/// not the sums of this layer alone; each sum is cast to `f64` before use.
/// Partial matrices from several layers are reduced by element-wise addition,
/// and the final distance is `sqrt(value)`.
///
/// # Panics
/// Indexing `col_sums` panics if it has no entry for a column index handed
/// out by `pairwise`.
pub fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
    let cell = |i: usize, j: usize| -> f64 {
        let left = self.col(i);
        let right = self.col(j);
        let total_i = col_sums[i] as f64;
        let total_j = col_sums[j] as f64;
        left.partial_relfreq_euclidean_dist(right, total_i, total_j)
    };
    self.pairwise(cell)
}
/// Builds this layer's partial matrix for the Hellinger distance.
///
/// For every column pair `(i, j)` the entry is
/// `Σ_slot (√(col_i/sum_i) - √(col_j/sum_j))²`, delegated to the
/// column-level `partial_hellinger_euclidean_dist`.
///
/// `col_sums` must contain the GLOBAL column sums (across all layers/partitions),
/// not the sums of this layer alone; each sum is cast to `f64` before use.
/// Partial matrices from several layers are reduced by element-wise addition,
/// and the final distance is `sqrt(value) / √2`.
///
/// # Panics
/// Indexing `col_sums` panics if it has no entry for a column index handed
/// out by `pairwise`.
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
    let cell = |i: usize, j: usize| -> f64 {
        let left = self.col(i);
        let right = self.col(j);
        let total_i = col_sums[i] as f64;
        let total_j = col_sums[j] as f64;
        left.partial_hellinger_euclidean_dist(right, total_i, total_j)
    };
    self.pairwise(cell)
}
// ── Private helpers ───────────────────────────────────────────────────────
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {