feat: implement parallel pairwise distance matrices
Introduces parallelized pairwise distance matrix computation for Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger metrics across `Columnar`, `Packed`, and `Implicit` matrix variants. Adds trait methods and convenience wrappers, safely handles normalization and zero-denominator edge cases, and updates test suites to import required traits for validation.
This commit is contained in:
@@ -53,14 +53,6 @@ impl ColumnarBitMatrix {
|
||||
Array1::from_vec(counts)
|
||||
}
|
||||
|
||||
pub(crate) fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise_f64(|i, j| self.col(i).jaccard_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
||||
@@ -83,15 +75,6 @@ impl ColumnarBitMatrix {
|
||||
self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
|
||||
}
|
||||
|
||||
fn pairwise_f64(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, f(i, j)))
|
||||
.collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
|
||||
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
|
||||
let n = self.n_cols();
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
@@ -220,28 +203,6 @@ impl PackedBitMatrix {
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| {
|
||||
let (inter, union) = self.partial_jaccard_col(i, j);
|
||||
let d = if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 };
|
||||
(i, j, d)
|
||||
})
|
||||
.collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
|
||||
pub(crate) fn hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
|
||||
.into_par_iter()
|
||||
.map(|(i, j)| (i, j, self.pair_op(i, j, false)))
|
||||
.collect();
|
||||
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
|
||||
}
|
||||
|
||||
pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
||||
@@ -390,22 +351,6 @@ impl PersistentBitMatrix {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
match self {
|
||||
Self::Columnar(m) => m.jaccard_dist_matrix(),
|
||||
Self::Packed(m) => m.jaccard_dist_matrix(),
|
||||
Self::Implicit { n_cols, .. } => Array2::zeros((*n_cols, *n_cols)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
match self {
|
||||
Self::Columnar(m) => m.hamming_dist_matrix(),
|
||||
Self::Packed(m) => m.hamming_dist_matrix(),
|
||||
Self::Implicit { n_cols, .. } => Array2::zeros((*n_cols, *n_cols)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
match self {
|
||||
Self::Columnar(m) => m.partial_jaccard_dist_matrix(),
|
||||
|
||||
@@ -54,23 +54,6 @@ impl ColumnarCompactIntMatrix {
|
||||
Array1::from_vec(sums)
|
||||
}
|
||||
|
||||
pub(crate) fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let sum_min = self.partial_bray_dist_matrix();
|
||||
let col_sums = self.sum();
|
||||
let n = self.n_cols();
|
||||
let mut m = Array2::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let denom = col_sums[i] + col_sums[j];
|
||||
m[[i, j]] = if denom == 0 { 0.0 }
|
||||
else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
|
||||
}
|
||||
@@ -119,30 +102,6 @@ impl ColumnarCompactIntMatrix {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).relfreq_bray_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).relfreq_euclidean_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).hellinger_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).jaccard_dist(self.col(j)))
|
||||
}
|
||||
|
||||
pub(crate) fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold))
|
||||
}
|
||||
|
||||
pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
|
||||
let mut meta = MatrixMeta::load(dir)?;
|
||||
let mut b = PersistentCompactIntVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?;
|
||||
@@ -343,29 +302,11 @@ impl PackedCompactIntMatrix {
|
||||
self.pairwise_u64(|i, j| self.pair_partial_bray(i, j))
|
||||
}
|
||||
|
||||
pub(crate) fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let col_sums = self.sum();
|
||||
let sum_min = self.partial_bray_dist_matrix();
|
||||
let n = self.n_cols;
|
||||
let mut m = Array2::zeros((n, n));
|
||||
for i in 0..n { for j in 0..n {
|
||||
if i != j {
|
||||
let denom = col_sums[i] + col_sums[j];
|
||||
m[[i, j]] = if denom == 0 { 0.0 }
|
||||
else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 };
|
||||
}
|
||||
}}
|
||||
m
|
||||
}
|
||||
|
||||
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_euclidean(i, j))
|
||||
}
|
||||
|
||||
pub(crate) fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_euclidean(i, j).sqrt())
|
||||
}
|
||||
|
||||
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
let n = self.n_cols;
|
||||
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
|
||||
@@ -381,46 +322,18 @@ impl PackedCompactIntMatrix {
|
||||
(inter_m, union_m)
|
||||
}
|
||||
|
||||
pub(crate) fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
let (inter, union) = self.pair_partial_threshold_jaccard(i, j, 1);
|
||||
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn threshold_jaccard_dist_matrix(&self, t: u32) -> Array2<f64> {
|
||||
self.pairwise(|i, j| {
|
||||
let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t);
|
||||
if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
pub(crate) fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let col_sums = self.sum();
|
||||
self.partial_relfreq_bray_dist_matrix(&col_sums)
|
||||
}
|
||||
|
||||
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
pub(crate) fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
let col_sums = self.sum();
|
||||
self.partial_relfreq_euclidean_dist_matrix(&col_sums)
|
||||
}
|
||||
|
||||
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
|
||||
self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
|
||||
}
|
||||
|
||||
pub(crate) fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
let col_sums = self.sum();
|
||||
self.partial_hellinger_euclidean_dist_matrix(&col_sums)
|
||||
}
|
||||
}
|
||||
|
||||
/// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
|
||||
@@ -508,27 +421,6 @@ impl PersistentCompactIntMatrix {
|
||||
match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
|
||||
}
|
||||
|
||||
pub fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.bray_dist_matrix(), Self::Packed(m) => m.bray_dist_matrix() }
|
||||
}
|
||||
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.relfreq_bray_dist_matrix(), Self::Packed(m) => m.relfreq_bray_dist_matrix() }
|
||||
}
|
||||
pub fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.euclidean_dist_matrix(), Self::Packed(m) => m.euclidean_dist_matrix() }
|
||||
}
|
||||
pub fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.relfreq_euclidean_dist_matrix(), Self::Packed(m) => m.relfreq_euclidean_dist_matrix() }
|
||||
}
|
||||
pub fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.hellinger_dist_matrix(), Self::Packed(m) => m.hellinger_dist_matrix() }
|
||||
}
|
||||
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.jaccard_dist_matrix(), Self::Packed(m) => m.jaccard_dist_matrix() }
|
||||
}
|
||||
pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
match self { Self::Columnar(m) => m.threshold_jaccard_dist_matrix(threshold), Self::Packed(m) => m.threshold_jaccard_dist_matrix(threshold) }
|
||||
}
|
||||
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
|
||||
match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
use crate::traits::BitPartials;
|
||||
|
||||
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
|
||||
let n = cols.first().map_or(0, |c| c.len());
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
use crate::traits::CountPartials;
|
||||
|
||||
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
|
||||
let n = cols.first().map_or(0, |c| c.len());
|
||||
|
||||
@@ -46,6 +46,10 @@ pub trait CountPartials: ColumnWeights {
|
||||
self.partial_euclidean().mapv(|v| v.sqrt())
|
||||
}
|
||||
|
||||
fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
self.threshold_jaccard_dist_matrix(1)
|
||||
}
|
||||
|
||||
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
let (inter, union) = self.partial_threshold_jaccard(threshold);
|
||||
let n = inter.shape()[0];
|
||||
|
||||
Reference in New Issue
Block a user