feat: add parallel distance matrix computation for bit and int matrices

Introduce parallel distance matrix generation using `ndarray` and `rayon` for both `BitMatrix` and `IntMatrix`. Adds full and additive-partial variants for Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger metrics. Includes comprehensive unit tests verifying matrix symmetry, zero diagonals, and numerical correctness against pairwise calculations.
This commit is contained in:
Eric Coissac
2026-05-15 17:18:02 +08:00
parent 1881e98bad
commit 8bee9f3017
6 changed files with 488 additions and 0 deletions
+125
View File
@@ -1,5 +1,8 @@
use std::{fs, io, path::{Path, PathBuf}};
use ndarray::{Array1, Array2};
use rayon::prelude::*;
use crate::builder::PersistentCompactIntVecBuilder;
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
@@ -29,8 +32,130 @@ impl PersistentCompactIntMatrix {
/// Gathers the value stored at `slot` in every column, in column order.
///
/// The returned boxed slice holds one entry per element of `self.cols`.
pub fn row(&self, slot: usize) -> Box<[u32]> {
    let values: Vec<u32> = self
        .cols
        .iter()
        .map(|column| column.get(slot))
        .collect();
    values.into_boxed_slice()
}
// ── Distance matrices ─────────────────────────────────────────────────────
/// Builds the `n × n` Bray-Curtis distance matrix over all columns.
///
/// Each off-diagonal cell `[i, j]` holds `col(i).bray_dist(col(j))`; the matrix is
/// produced through `pairwise`, which evaluates only `i < j`, mirrors the value to
/// `[j, i]`, and leaves the diagonal at `0.0`.
pub fn bray_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.bray_dist(rhs)
    })
}
/// Builds the `n × n` matrix of `relfreq_bray_dist` values over all columns.
///
/// NOTE(review): "relfreq" presumably means the columns are normalised to relative
/// frequencies before the Bray-Curtis computation — confirm against the column type.
/// The matrix is produced through `pairwise`: only `i < j` is evaluated, the value is
/// mirrored to `[j, i]`, and the diagonal stays `0.0`.
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.relfreq_bray_dist(rhs)
    })
}
/// Builds the `n × n` Euclidean distance matrix over all columns.
///
/// Each off-diagonal cell `[i, j]` holds `col(i).euclidean_dist(col(j))`; the matrix is
/// produced through `pairwise`, which evaluates only `i < j`, mirrors the value to
/// `[j, i]`, and leaves the diagonal at `0.0`.
pub fn euclidean_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.euclidean_dist(rhs)
    })
}
/// Builds the `n × n` matrix of `relfreq_euclidean_dist` values over all columns.
///
/// NOTE(review): "relfreq" presumably means the columns are normalised to relative
/// frequencies before the Euclidean computation — confirm against the column type.
/// The matrix is produced through `pairwise`: only `i < j` is evaluated, the value is
/// mirrored to `[j, i]`, and the diagonal stays `0.0`.
pub fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.relfreq_euclidean_dist(rhs)
    })
}
/// Builds the `n × n` Hellinger distance matrix over all columns.
///
/// Each off-diagonal cell `[i, j]` holds `col(i).hellinger_dist(col(j))`; the matrix is
/// produced through `pairwise`, which evaluates only `i < j`, mirrors the value to
/// `[j, i]`, and leaves the diagonal at `0.0`.
pub fn hellinger_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.hellinger_dist(rhs)
    })
}
/// Builds the `n × n` Jaccard distance matrix over all columns.
///
/// Each off-diagonal cell `[i, j]` holds `col(i).jaccard_dist(col(j))`; the matrix is
/// produced through `pairwise`, which evaluates only `i < j`, mirrors the value to
/// `[j, i]`, and leaves the diagonal at `0.0`.
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.jaccard_dist(rhs)
    })
}
/// Builds the `n × n` threshold-Jaccard distance matrix over all columns.
///
/// `threshold` is forwarded unchanged to `threshold_jaccard_dist` for every pair.
/// NOTE(review): presumably a slot counts as present when its value reaches
/// `threshold` — confirm against the column type.
/// The matrix is produced through `pairwise`: only `i < j` is evaluated, the value is
/// mirrored to `[j, i]`, and the diagonal stays `0.0`.
pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.threshold_jaccard_dist(rhs, threshold)
    })
}
// ── Partial matrices (additively decomposable across layers) ──────────────
/// Additive Bray-Curtis components for this layer: `(sum_min[n×n], col_sums[n])`.
///
/// * `sum_min[i, j]` = Σ_slot min(col_i[slot], col_j[slot]), taken from the first
///   field of `partial_bray_dist`. Only `i < j` is evaluated and mirrored; diagonal
///   entries are not computed and stay `0`.
/// * `col_sums[k]` = Σ_slot col_k[slot], computed in parallel over the columns.
///
/// Both parts can be summed element-wise across layers before the final distance
/// is derived.
pub fn partial_bray_dist_matrix(&self) -> (Array2<u64>, Array1<u64>) {
    let column_count = self.n_cols();

    // Per-column totals, one parallel task per column.
    let totals: Vec<u64> = (0..column_count)
        .into_par_iter()
        .map(|k| self.col(k).sum())
        .collect();

    // Pairwise sums of minima; the remaining fields of the partial result are unused here.
    let shared_minima = self.pairwise_u64(|a, b| {
        let partial = self.col(a).partial_bray_dist(self.col(b));
        partial.0
    });

    (shared_minima, Array1::from_vec(totals))
}
/// Additive Euclidean component for this layer: the `n × n` matrix of summed
/// squared differences, as returned by `partial_euclidean_dist` for each column pair.
///
/// Sum these matrices element-wise across layers, then apply `sqrt` to obtain the
/// final distance. Only `i < j` is evaluated and mirrored; the diagonal stays `0.0`.
pub fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
    self.pairwise(|a, b| {
        let lhs = self.col(a);
        let rhs = self.col(b);
        lhs.partial_euclidean_dist(rhs)
    })
}
/// Returns `(inter[n×n], union[n×n])` for threshold-Jaccard.
/// Reduce across layers by element-wise addition before computing `1 - inter/union`.
pub fn partial_threshold_jaccard_dist_matrix(
&self,
threshold: u32,
) -> (Array2<u64>, Array2<u64>) {
let n = self.n_cols();
let pairs = upper_pairs(n);
let results: Vec<(usize, usize, u64, u64)> = pairs
.into_par_iter()
.map(|(i, j)| {
let (inter, union) =
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
(i, j, inter, union)
})
.collect();
let mut inter_m = Array2::zeros((n, n));
let mut union_m = Array2::zeros((n, n));
for (i, j, inter, union) in results {
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
union_m[[i, j]] = union; union_m[[j, i]] = union;
}
(inter_m, union_m)
}
// ── Private helpers ───────────────────────────────────────────────────────
/// Evaluates `f(i, j)` in parallel for every column pair `i < j` and returns the
/// symmetric `n × n` matrix of results, where `n = self.n_cols()`.
///
/// `f` is never called with `i == j`; diagonal cells keep the `f64` default `0.0`.
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
    let dim = self.n_cols();
    let index_pairs = upper_pairs(dim);

    let cells: Vec<(usize, usize, f64)> = index_pairs
        .into_par_iter()
        .map(|(row, col)| {
            let dist = f(row, col);
            (row, col, dist)
        })
        .collect();

    // The same value goes to both [row, col] and [col, row].
    let mirrored = cells
        .into_iter()
        .map(|(row, col, dist)| (row, col, dist, dist));
    fill_symmetric(dim, mirrored)
}
/// Integer counterpart of `pairwise`: evaluates `f(i, j)` in parallel for every
/// column pair `i < j` and returns the symmetric `n × n` matrix of `u64` results.
///
/// `f` is never called with `i == j`; diagonal cells keep the `u64` default `0`.
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
    let dim = self.n_cols();
    let index_pairs = upper_pairs(dim);

    let cells: Vec<(usize, usize, u64)> = index_pairs
        .into_par_iter()
        .map(|(row, col)| {
            let count = f(row, col);
            (row, col, count)
        })
        .collect();

    // The same value goes to both [row, col] and [col, row].
    let mirrored = cells
        .into_iter()
        .map(|(row, col, count)| (row, col, count, count));
    fill_symmetric(dim, mirrored)
}
}
/// Lists every index pair `(i, j)` with `i < j < n`, ordered by `i` and then by `j`
/// (the strict upper triangle of an `n × n` matrix, row by row).
///
/// Returns an empty vector when `n` is `0` or `1`.
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
    // Exactly n·(n−1)/2 pairs exist. Reserving them up front avoids the repeated
    // reallocations a `flat_map(..).collect()` incurs, because `flat_map` cannot
    // report a useful lower size bound.
    let pair_count = n.saturating_mul(n.saturating_sub(1)) / 2;
    let mut pairs = Vec::with_capacity(pair_count);
    for i in 0..n {
        pairs.extend((i + 1..n).map(|j| (i, j)));
    }
    pairs
}
/// Builds an `n × n` matrix pre-filled with `T::default()` and scatters `vals` into it.
///
/// Every item `(i, j, vij, vji)` writes `vij` to `[i, j]` and then `vji` to `[j, i]`.
/// Cells that no item addresses keep the default value. Indexing panics if an index
/// is not below `n`.
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
where
    T: Clone + Default,
{
    let mut matrix = Array2::from_elem((n, n), T::default());
    vals.for_each(|(row, col, upper, lower)| {
        matrix[[row, col]] = upper;
        matrix[[col, row]] = lower;
    });
    matrix
}
// ── Builder ───────────────────────────────────────────────────────────────────
pub struct PersistentCompactIntMatrixBuilder {
dir: PathBuf,
n: usize,