From 1ec65922df342bb6356040fa5fb7e12c8c46f688 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 8 Jun 2026 20:04:51 +0200 Subject: [PATCH] feat: implement parallel pairwise distance matrices Introduces parallelized pairwise distance matrix computation for Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger metrics across `Columnar`, `Packed`, and `Implicit` matrix variants. Adds trait methods and convenience wrappers, safely handles normalization and zero-denominator edge cases, and updates test suites to import required traits for validation. --- src/obicompactvec/src/bitmatrix.rs | 55 ------------ src/obicompactvec/src/intmatrix.rs | 108 ----------------------- src/obicompactvec/src/tests/bitmatrix.rs | 1 + src/obicompactvec/src/tests/intmatrix.rs | 1 + src/obicompactvec/src/traits.rs | 4 + 5 files changed, 6 insertions(+), 163 deletions(-) diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index 269f247..631db63 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -53,14 +53,6 @@ impl ColumnarBitMatrix { Array1::from_vec(counts) } - pub(crate) fn jaccard_dist_matrix(&self) -> Array2 { - self.pairwise_f64(|i, j| self.col(i).jaccard_dist(self.col(j))) - } - - pub(crate) fn hamming_dist_matrix(&self) -> Array2 { - self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j))) - } - pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { let n = self.n_cols(); let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) @@ -83,15 +75,6 @@ impl ColumnarBitMatrix { self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j))) } - fn pairwise_f64(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2 { - let n = self.n_cols(); - let results: Vec<(usize, usize, f64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| (i, j, f(i, j))) - .collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } - fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { let n = self.n_cols(); let results: Vec<(usize, usize, u64)> = upper_pairs(n) @@ -220,28 +203,6 @@ impl PackedBitMatrix { ) } - pub(crate) fn jaccard_dist_matrix(&self) -> Array2 { - let n = self.n_cols; - let results: Vec<(usize, usize, f64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| { - let (inter, union) = self.partial_jaccard_col(i, j); - let d = if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 }; - (i, j, d) - }) - .collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } - - pub(crate) fn hamming_dist_matrix(&self) -> Array2 { - let n = self.n_cols; - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| (i, j, self.pair_op(i, j, false))) - .collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } - pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { let n = self.n_cols; let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) @@ -390,22 +351,6 @@ impl PersistentBitMatrix { } } - pub fn jaccard_dist_matrix(&self) -> Array2 { - match self { - Self::Columnar(m) => m.jaccard_dist_matrix(), - Self::Packed(m) => m.jaccard_dist_matrix(), - Self::Implicit { n_cols, .. } => Array2::zeros((*n_cols, *n_cols)), - } - } - - pub fn hamming_dist_matrix(&self) -> Array2 { - match self { - Self::Columnar(m) => m.hamming_dist_matrix(), - Self::Packed(m) => m.hamming_dist_matrix(), - Self::Implicit { n_cols, .. } => Array2::zeros((*n_cols, *n_cols)), - } - } - pub fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { match self { Self::Columnar(m) => m.partial_jaccard_dist_matrix(), diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index 198f210..37547c6 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -54,23 +54,6 @@ impl ColumnarCompactIntMatrix { Array1::from_vec(sums) } - pub(crate) fn bray_dist_matrix(&self) -> Array2 { - let sum_min = self.partial_bray_dist_matrix(); - let col_sums = self.sum(); - let n = self.n_cols(); - let mut m = Array2::zeros((n, n)); - for i in 0..n { - for j in 0..n { - if i != j { - let denom = col_sums[i] + col_sums[j]; - m[[i, j]] = if denom == 0 { 0.0 } - else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 }; - } - } - } - m - } - pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j))) } @@ -119,30 +102,6 @@ impl ColumnarCompactIntMatrix { }) } - pub(crate) fn relfreq_bray_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.col(i).relfreq_bray_dist(self.col(j))) - } - - pub(crate) fn euclidean_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.col(i).euclidean_dist(self.col(j))) - } - - pub(crate) fn relfreq_euclidean_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.col(i).relfreq_euclidean_dist(self.col(j))) - } - - pub(crate) fn hellinger_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.col(i).hellinger_dist(self.col(j))) - } - - pub(crate) fn jaccard_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.col(i).jaccard_dist(self.col(j))) - } - - pub(crate) fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2 { - self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold)) - } - pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> { let mut meta = MatrixMeta::load(dir)?; let mut b = PersistentCompactIntVecBuilder::new(meta.n, &col_path(dir, meta.n_cols))?; @@ -343,29 +302,11 @@ impl PackedCompactIntMatrix { self.pairwise_u64(|i, j| self.pair_partial_bray(i, j)) } - pub(crate) fn bray_dist_matrix(&self) -> Array2 { - let col_sums = self.sum(); - let sum_min = self.partial_bray_dist_matrix(); - let n = self.n_cols; - let mut m = Array2::zeros((n, n)); - for i in 0..n { for j in 0..n { - if i != j { - let denom = col_sums[i] + col_sums[j]; - m[[i, j]] = if denom == 0 { 0.0 } - else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 }; - } - }} - m - } pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2 { self.pairwise(|i, j| self.pair_partial_euclidean(i, j)) } - pub(crate) fn euclidean_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.pair_partial_euclidean(i, j).sqrt()) - } - pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2, Array2) { let n = self.n_cols; let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) @@ -381,46 +322,18 @@ impl PackedCompactIntMatrix { (inter_m, union_m) } - pub(crate) fn jaccard_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| { - let (inter, union) = self.pair_partial_threshold_jaccard(i, j, 1); - if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } - }) - } - - pub(crate) fn threshold_jaccard_dist_matrix(&self, t: u32) -> Array2 { - self.pairwise(|i, j| { - let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); - if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } - }) - } - pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - pub(crate) fn relfreq_bray_dist_matrix(&self) -> Array2 { - let col_sums = self.sum(); - self.partial_relfreq_bray_dist_matrix(&col_sums) - } - pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - pub(crate) fn relfreq_euclidean_dist_matrix(&self) -> Array2 { - let col_sums = self.sum(); - self.partial_relfreq_euclidean_dist_matrix(&col_sums) - } - pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - pub(crate) fn hellinger_dist_matrix(&self) -> Array2 { - let col_sums = self.sum(); - self.partial_hellinger_euclidean_dist_matrix(&col_sums) - } } /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files. @@ -508,27 +421,6 @@ impl PersistentCompactIntMatrix { match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() } } - pub fn bray_dist_matrix(&self) -> Array2 { - match self { Self::Columnar(m) => m.bray_dist_matrix(), Self::Packed(m) => m.bray_dist_matrix() } - } - pub fn relfreq_bray_dist_matrix(&self) -> Array2 { - match self { Self::Columnar(m) => m.relfreq_bray_dist_matrix(), Self::Packed(m) => m.relfreq_bray_dist_matrix() } - } - pub fn euclidean_dist_matrix(&self) -> Array2 { - match self { Self::Columnar(m) => m.euclidean_dist_matrix(), Self::Packed(m) => m.euclidean_dist_matrix() } - } - pub fn relfreq_euclidean_dist_matrix(&self) -> Array2 { - match self { Self::Columnar(m) => m.relfreq_euclidean_dist_matrix(), Self::Packed(m) => m.relfreq_euclidean_dist_matrix() } - } - pub fn hellinger_dist_matrix(&self) -> Array2 { - match self { Self::Columnar(m) => m.hellinger_dist_matrix(), Self::Packed(m) => m.hellinger_dist_matrix() } - } - pub fn jaccard_dist_matrix(&self) -> Array2 { - match self { Self::Columnar(m) => m.jaccard_dist_matrix(), Self::Packed(m) => m.jaccard_dist_matrix() } - } - pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2 { - match self { Self::Columnar(m) => m.threshold_jaccard_dist_matrix(threshold), Self::Packed(m) => m.threshold_jaccard_dist_matrix(threshold) } - } pub fn partial_bray_dist_matrix(&self) -> Array2 { match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() } } diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs index d825d85..741a07c 100644 --- a/src/obicompactvec/src/tests/bitmatrix.rs +++ b/src/obicompactvec/src/tests/bitmatrix.rs @@ -1,6 +1,7 @@ use tempfile::tempdir; use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder}; +use crate::traits::BitPartials; fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { let n = cols.first().map_or(0, |c| c.len()); diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs index b9e1841..c4c0a98 100644 --- a/src/obicompactvec/src/tests/intmatrix.rs +++ b/src/obicompactvec/src/tests/intmatrix.rs @@ -1,6 +1,7 @@ use tempfile::tempdir; use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; +use crate::traits::CountPartials; fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { let n = cols.first().map_or(0, |c| c.len()); diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index 7cccf48..f65ad9d 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -46,6 +46,10 @@ pub trait CountPartials: ColumnWeights { self.partial_euclidean().mapv(|v| v.sqrt()) } + fn jaccard_dist_matrix(&self) -> Array2 { + self.threshold_jaccard_dist_matrix(1) + } + fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2 { let (inter, union) = self.partial_threshold_jaccard(threshold); let n = inter.shape()[0];