feat: Add parallel column counts and partial distance metrics
Introduces parallel `count_ones` for `BitMatrix` and parallel column-sum aggregation alongside three pairwise distance constructors (Bray-Curtis, Euclidean, Hellinger) for `IntMatrix`. These methods support partial, layer-wise data by accepting precomputed global column sums for normalization, enabling additive decomposition across partitions. Includes unit tests verifying mathematical equivalence and partition additivity.
This commit is contained in:
@@ -63,6 +63,15 @@ impl PersistentCompactIntMatrix {
|
||||
self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold))
|
||||
}
|
||||
|
||||
/// Returns the sum of each column as `Array1<u64>`.
|
||||
pub fn sum(&self) -> Array1<u64> {
|
||||
let sums: Vec<u64> = (0..self.n_cols())
|
||||
.into_par_iter()
|
||||
.map(|c| self.col(c).sum())
|
||||
.collect();
|
||||
Array1::from_vec(sums)
|
||||
}
|
||||
|
||||
// ── Partial matrices (additively decomposable across layers) ──────────────
|
||||
|
||||
/// Returns `(sum_min[n×n], col_sums[n])`.
|
||||
@@ -117,6 +126,45 @@ impl PersistentCompactIntMatrix {
|
||||
(inter_m, union_m)
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot min(col_i[slot]/sum_i, col_j[slot]/sum_j)` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `1 - value`.
|
||||
pub fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_relfreq_bray_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot (col_i[slot]/sum_i - col_j[slot]/sum_j)²` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `sqrt(value)`.
|
||||
pub fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_relfreq_euclidean_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns matrix of `Σ_slot (√(col_i/sum_i) - √(col_j/sum_j))²` per pair.
|
||||
/// `col_sums` must be the GLOBAL sums across all layers/partitions.
|
||||
/// Reduce across layers by element-wise addition; final distance = `sqrt(value) / √2`.
|
||||
pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
self.col(i).partial_hellinger_euclidean_dist(
|
||||
self.col(j),
|
||||
col_sums[i] as f64,
|
||||
col_sums[j] as f64,
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
// ── Private helpers ───────────────────────────────────────────────────────
|
||||
|
||||
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
|
||||
|
||||
Reference in New Issue
Block a user