feat: add parallel distance matrix computation for bit and int matrices
Introduce parallel distance matrix generation using `ndarray` and `rayon` for both `BitMatrix` and `IntMatrix`. Adds full and additive-partial variants for Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger metrics. Includes comprehensive unit tests verifying matrix symmetry, zero diagonals, and numerical correctness against pairwise calculations.
This commit is contained in:
@@ -1,5 +1,8 @@
|
||||
use std::{fs, io, path::{Path, PathBuf}};
|
||||
|
||||
use ndarray::{Array1, Array2};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use crate::builder::PersistentCompactIntVecBuilder;
|
||||
use crate::meta::MatrixMeta;
|
||||
use crate::reader::PersistentCompactIntVec;
|
||||
@@ -29,8 +32,130 @@ impl PersistentCompactIntMatrix {
|
||||
pub fn row(&self, slot: usize) -> Box<[u32]> {
|
||||
self.cols.iter().map(|c| c.get(slot)).collect()
|
||||
}
|
||||
|
||||
// ── Distance matrices ─────────────────────────────────────────────────────
|
||||
|
||||
/// Builds the square Bray-Curtis distance matrix between all columns.
///
/// `bray_dist` is evaluated once per column pair `i < j` through `pairwise`
/// and mirrored across the diagonal; diagonal entries stay at `0.0`.
pub fn bray_dist_matrix(&self) -> Array2<f64> {
    let pair_dist = |a: usize, b: usize| self.col(a).bray_dist(self.col(b));
    self.pairwise(pair_dist)
}
|
||||
|
||||
/// Builds the square matrix of `relfreq_bray_dist` values between all columns.
///
/// Each pair `i < j` is evaluated once through `pairwise` and mirrored;
/// diagonal entries stay at `0.0`.
// NOTE(review): "relfreq" presumably means the columns are compared as
// relative frequencies — confirm against `relfreq_bray_dist`.
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
    let pair_dist = |a: usize, b: usize| self.col(a).relfreq_bray_dist(self.col(b));
    self.pairwise(pair_dist)
}
|
||||
|
||||
/// Builds the square Euclidean distance matrix between all columns.
///
/// `euclidean_dist` is evaluated once per column pair `i < j` through
/// `pairwise` and mirrored; diagonal entries stay at `0.0`.
pub fn euclidean_dist_matrix(&self) -> Array2<f64> {
    let pair_dist = |a: usize, b: usize| self.col(a).euclidean_dist(self.col(b));
    self.pairwise(pair_dist)
}
|
||||
|
||||
/// Builds the square matrix of `relfreq_euclidean_dist` values between all columns.
///
/// Each pair `i < j` is evaluated once through `pairwise` and mirrored;
/// diagonal entries stay at `0.0`.
// NOTE(review): "relfreq" presumably means the columns are compared as
// relative frequencies — confirm against `relfreq_euclidean_dist`.
pub fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
    let pair_dist = |a: usize, b: usize| self.col(a).relfreq_euclidean_dist(self.col(b));
    self.pairwise(pair_dist)
}
|
||||
|
||||
/// Builds the square Hellinger distance matrix between all columns.
///
/// `hellinger_dist` is evaluated once per column pair `i < j` through
/// `pairwise` and mirrored; diagonal entries stay at `0.0`.
pub fn hellinger_dist_matrix(&self) -> Array2<f64> {
    let pair_dist = |a: usize, b: usize| self.col(a).hellinger_dist(self.col(b));
    self.pairwise(pair_dist)
}
|
||||
|
||||
/// Builds the square Jaccard distance matrix between all columns.
///
/// `jaccard_dist` is evaluated once per column pair `i < j` through
/// `pairwise` and mirrored; diagonal entries stay at `0.0`.
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
    let pair_dist = |a: usize, b: usize| self.col(a).jaccard_dist(self.col(b));
    self.pairwise(pair_dist)
}
|
||||
|
||||
/// Builds the square threshold-Jaccard distance matrix between all columns.
///
/// `threshold` is forwarded unchanged to `threshold_jaccard_dist` for every
/// column pair `i < j`; results are mirrored and diagonal entries stay at `0.0`.
pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
    let pair_dist =
        |a: usize, b: usize| self.col(a).threshold_jaccard_dist(self.col(b), threshold);
    self.pairwise(pair_dist)
}
|
||||
|
||||
// ── Partial matrices (additively decomposable across layers) ──────────────
|
||||
|
||||
/// Returns `(sum_min[n×n], col_sums[n])`.
|
||||
/// `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]).
|
||||
/// `col_sums[k]` = Σ_slot col_k[slot].
|
||||
/// Reduce across layers by element-wise addition before computing the final distance.
|
||||
pub fn partial_bray_dist_matrix(&self) -> (Array2<u64>, Array1<u64>) {
|
||||
let n = self.n_cols();
|
||||
|
||||
let col_sums: Vec<u64> = (0..n)
|
||||
.into_par_iter()
|
||||
.map(|i| self.col(i).sum())
|
||||
.collect();
|
||||
|
||||
let sum_min = self.pairwise_u64(|i, j| {
|
||||
self.col(i).partial_bray_dist(self.col(j)).0
|
||||
});
|
||||
|
||||
(sum_min, Array1::from_vec(col_sums))
|
||||
}
|
||||
|
||||
/// Returns sum of squared differences `[n×n]`.
|
||||
/// Reduce across layers by element-wise addition, then take `sqrt` for the final distance.
|
||||
pub fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
/// Returns `(inter[n×n], union[n×n])` for threshold-Jaccard.
|
||||
/// Reduce across layers by element-wise addition before computing `1 - inter/union`.
|
||||
pub fn partial_threshold_jaccard_dist_matrix(
|
||||
&self,
|
||||
threshold: u32,
|
||||
) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols();
|
||||
let pairs = upper_pairs(n);
|
||||
|
||||
let results: Vec<(usize, usize, u64, u64)> = pairs
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| {
|
||||
let (inter, union) =
|
||||
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
|
||||
(i, j, inter, union)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut inter_m = Array2::zeros((n, n));
|
||||
let mut union_m = Array2::zeros((n, n));
|
||||
for (i, j, inter, union) in results {
|
||||
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
|
||||
union_m[[i, j]] = union; union_m[[j, i]] = union;
|
||||
}
|
||||
(inter_m, union_m)
|
||||
}
|
||||
|
||||
// ── Private helpers ───────────────────────────────────────────────────────
|
||||
|
||||
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, f(i, j)))
|
||||
.collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
|
||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, f(i, j)))
|
||||
.collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Lists every index pair `(i, j)` with `i < j < n`, ordered by `i` and then by `j`.
///
/// The result has `n * (n - 1) / 2` entries; `n` of 0 or 1 yields an empty vector.
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
    // The exact pair count is known up front, so reserve it once instead of
    // letting the vector regrow (a `flat_map` chain gives `collect` no useful
    // size hint).
    let pair_count = n.saturating_mul(n.saturating_sub(1)) / 2;
    let mut pairs = Vec::with_capacity(pair_count);
    for i in 0..n {
        pairs.extend((i + 1..n).map(move |j| (i, j)));
    }
    pairs
}
|
||||
|
||||
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
|
||||
where
|
||||
T: Clone + Default,
|
||||
{
|
||||
let mut m = Array2::from_elem((n, n), T::default());
|
||||
for (i, j, vij, vji) in vals {
|
||||
m[[i, j]] = vij;
|
||||
m[[j, i]] = vji;
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentCompactIntMatrixBuilder {
|
||||
dir: PathBuf,
|
||||
n: usize,
|
||||
|
||||
Reference in New Issue
Block a user