From 8bee9f301798fa777b02041ca4550401ab6e967f Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 15 May 2026 17:18:02 +0800 Subject: [PATCH] feat: add parallel distance matrix computation for bit and int matrices Introduce parallel distance matrix generation using `ndarray` and `rayon` for both `BitMatrix` and `IntMatrix`. Adds full and additive-partial variants for Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger metrics. Includes comprehensive unit tests verifying matrix symmetry, zero diagonals, and numerical correctness against pairwise calculations. --- src/Cargo.lock | 42 ++++++++ src/obicompactvec/Cargo.toml | 2 + src/obicompactvec/src/bitmatrix.rs | 80 +++++++++++++++ src/obicompactvec/src/intmatrix.rs | 125 +++++++++++++++++++++++ src/obicompactvec/src/tests/bitmatrix.rs | 115 +++++++++++++++++++++ src/obicompactvec/src/tests/intmatrix.rs | 124 ++++++++++++++++++++++ 6 files changed, 488 insertions(+) diff --git a/src/Cargo.lock b/src/Cargo.lock index e428b22..68a2b0d 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1460,6 +1460,16 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "mem_dbg" version = "0.3.4" @@ -1546,6 +1556,21 @@ version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "729eb334247daa1803e0a094d0a5c55711b85571179f5ec6e53eccfdf7008958" +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "niffler" version = "2.7.0" @@ -1646,6 +1671,15 @@ dependencies = [ "itoa", ] +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1660,6 +1694,8 @@ name = "obicompactvec" version = "0.1.0" dependencies = [ "memmap2", + "ndarray", + "rayon", "tempfile", ] @@ -2226,6 +2262,12 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" version = "1.12.0" diff --git a/src/obicompactvec/Cargo.toml b/src/obicompactvec/Cargo.toml index 1d65aa9..dc4527a 100644 --- a/src/obicompactvec/Cargo.toml +++ b/src/obicompactvec/Cargo.toml @@ -5,6 +5,8 @@ edition = "2024" [dependencies] memmap2 = "0.9" +ndarray = "0.16" +rayon = "1" [dev-dependencies] tempfile = "3" diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index e646ba8..7a10219 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -1,5 +1,8 @@ use std::{fs, io, path::{Path, PathBuf}}; +use ndarray::{Array2}; +use rayon::prelude::*; + use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; use crate::meta::MatrixMeta; @@ -28,8 +31,85 @@ impl PersistentBitMatrix { pub fn row(&self, slot: usize) -> Box<[bool]> { self.cols.iter().map(|c| c.get(slot)).collect() } + + // ── Distance matrices ───────────────────────────────────────────────────── + + pub fn jaccard_dist_matrix(&self) -> Array2 { + self.pairwise_f64(|i, j| self.col(i).jaccard_dist(self.col(j))) + } + + pub fn hamming_dist_matrix(&self) -> Array2 { + self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j))) + } + + // ── Partial matrices (additively decomposable across layers) ────────────── + + /// Returns `(inter[n×n], union[n×n])`. + /// Reduce across layers by element-wise addition before computing `1 - inter/union`. + pub fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { + let n = self.n_cols(); + let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) + .into_par_iter() + .map(|(i, j)| { + let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j)); + (i, j, inter, union) + }) + .collect(); + + let mut inter_m = Array2::zeros((n, n)); + let mut union_m = Array2::zeros((n, n)); + for (i, j, inter, union) in results { + inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; + union_m[[i, j]] = union; union_m[[j, i]] = union; + } + (inter_m, union_m) + } + + /// Returns `hamming[n×n]` (number of differing bits per pair). + /// Additive across layers — reduce by element-wise addition. + pub fn partial_hamming_dist_matrix(&self) -> Array2 { + self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j))) + } + + // ── Private helpers ─────────────────────────────────────────────────────── + + fn pairwise_f64(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2 { + let n = self.n_cols(); + let results: Vec<(usize, usize, f64)> = upper_pairs(n) + .into_par_iter() + .map(|(i, j)| (i, j, f(i, j))) + .collect(); + fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + } + + fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { + let n = self.n_cols(); + let results: Vec<(usize, usize, u64)> = upper_pairs(n) + .into_par_iter() + .map(|(i, j)| (i, j, f(i, j))) + .collect(); + fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + } } +fn upper_pairs(n: usize) -> Vec<(usize, usize)> { + (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect() +} + +fn fill_symmetric(n: usize, vals: impl Iterator) -> Array2 +where + T: Clone + Default, +{ + let mut m = Array2::from_elem((n, n), T::default()); + for (i, j, vij, vji) in vals { + m[[i, j]] = vij; + m[[j, i]] = vji; + } + m +} + +// ── Builder ─────────────────────────────────────────────────────────────────── + pub struct PersistentBitMatrixBuilder { dir: PathBuf, n: usize, diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index f86e1a2..5cadd19 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -1,5 +1,8 @@ use std::{fs, io, path::{Path, PathBuf}}; +use ndarray::{Array1, Array2}; +use rayon::prelude::*; + use crate::builder::PersistentCompactIntVecBuilder; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; @@ -29,8 +32,130 @@ impl PersistentCompactIntMatrix { pub fn row(&self, slot: usize) -> Box<[u32]> { self.cols.iter().map(|c| c.get(slot)).collect() } + + // ── Distance matrices ───────────────────────────────────────────────────── + + pub fn bray_dist_matrix(&self) -> Array2 { + self.pairwise(|i, j| self.col(i).bray_dist(self.col(j))) + } + + pub fn relfreq_bray_dist_matrix(&self) -> Array2 { + self.pairwise(|i, j| self.col(i).relfreq_bray_dist(self.col(j))) + } + + pub fn euclidean_dist_matrix(&self) -> Array2 { + self.pairwise(|i, j| self.col(i).euclidean_dist(self.col(j))) + } + + pub fn relfreq_euclidean_dist_matrix(&self) -> Array2 { + self.pairwise(|i, j| self.col(i).relfreq_euclidean_dist(self.col(j))) + } + + pub fn hellinger_dist_matrix(&self) -> Array2 { + self.pairwise(|i, j| self.col(i).hellinger_dist(self.col(j))) + } + + pub fn jaccard_dist_matrix(&self) -> Array2 { + self.pairwise(|i, j| self.col(i).jaccard_dist(self.col(j))) + } + + pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2 { + self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold)) + } + + // ── Partial matrices (additively decomposable across layers) ────────────── + + /// Returns `(sum_min[n×n], col_sums[n])`. + /// `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]). + /// `col_sums[k]` = Σ_slot col_k[slot]. + /// Reduce across layers by element-wise addition before computing the final distance. + pub fn partial_bray_dist_matrix(&self) -> (Array2, Array1) { + let n = self.n_cols(); + + let col_sums: Vec = (0..n) + .into_par_iter() + .map(|i| self.col(i).sum()) + .collect(); + + let sum_min = self.pairwise_u64(|i, j| { + self.col(i).partial_bray_dist(self.col(j)).0 + }); + + (sum_min, Array1::from_vec(col_sums)) + } + + /// Returns sum of squared differences `[n×n]`. + /// Reduce across layers by element-wise addition, then take `sqrt` for the final distance. + pub fn partial_euclidean_dist_matrix(&self) -> Array2 { + self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j))) + } + + /// Returns `(inter[n×n], union[n×n])` for threshold-Jaccard. + /// Reduce across layers by element-wise addition before computing `1 - inter/union`. + pub fn partial_threshold_jaccard_dist_matrix( + &self, + threshold: u32, + ) -> (Array2, Array2) { + let n = self.n_cols(); + let pairs = upper_pairs(n); + + let results: Vec<(usize, usize, u64, u64)> = pairs + .into_par_iter() + .map(|(i, j)| { + let (inter, union) = + self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold); + (i, j, inter, union) + }) + .collect(); + + let mut inter_m = Array2::zeros((n, n)); + let mut union_m = Array2::zeros((n, n)); + for (i, j, inter, union) in results { + inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; + union_m[[i, j]] = union; union_m[[j, i]] = union; + } + (inter_m, union_m) + } + + // ── Private helpers ─────────────────────────────────────────────────────── + + fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2 { + let n = self.n_cols(); + let results: Vec<(usize, usize, f64)> = upper_pairs(n) + .into_par_iter() + .map(|(i, j)| (i, j, f(i, j))) + .collect(); + fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + } + + fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { + let n = self.n_cols(); + let results: Vec<(usize, usize, u64)> = upper_pairs(n) + .into_par_iter() + .map(|(i, j)| (i, j, f(i, j))) + .collect(); + fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + } } +fn upper_pairs(n: usize) -> Vec<(usize, usize)> { + (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect() +} + +fn fill_symmetric(n: usize, vals: impl Iterator) -> Array2 +where + T: Clone + Default, +{ + let mut m = Array2::from_elem((n, n), T::default()); + for (i, j, vij, vji) in vals { + m[[i, j]] = vij; + m[[j, i]] = vji; + } + m +} + +// ── Builder ─────────────────────────────────────────────────────────────────── + pub struct PersistentCompactIntMatrixBuilder { dir: PathBuf, n: usize, diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs index 1fab8ff..bfb6e34 100644 --- a/src/obicompactvec/src/tests/bitmatrix.rs +++ b/src/obicompactvec/src/tests/bitmatrix.rs @@ -2,6 +2,22 @@ use tempfile::tempdir; use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder}; +fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let mut b = PersistentBitMatrixBuilder::new(n, dir.path()).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { + cb.set(slot, v); + } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + (dir, m) +} + #[test] fn single_col_roundtrip() { let dir = tempdir().unwrap(); @@ -67,3 +83,102 @@ fn zero_cols_roundtrip() { assert_eq!(m.n_cols(), 0); assert_eq!(m.n(), 8); } + +// ── Distance matrix tests ───────────────────────────────────────────────────── + +#[test] +fn jaccard_dist_matrix_symmetry_and_diagonal() { + let (_d, m) = make_matrix(&[ + &[true, false, true], + &[true, true, false], + &[false, true, true], + ]); + let dm = m.jaccard_dist_matrix(); + let n = m.n_cols(); + for i in 0..n { assert_eq!(dm[[i, i]], 0.0, "diagonal"); } + for i in 0..n { + for j in 0..n { + assert!((dm[[i, j]] - dm[[j, i]]).abs() < 1e-12, "symmetry [{i},{j}]"); + } + } +} + +#[test] +fn jaccard_dist_matrix_values_match_pairwise() { + let (_d, m) = make_matrix(&[ + &[true, false, true], + &[true, true, false], + &[false, true, true], + ]); + let dm = m.jaccard_dist_matrix(); + for i in 0..m.n_cols() { + for j in 0..m.n_cols() { + let expected = m.col(i).jaccard_dist(m.col(j)); + assert!((dm[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]"); + } + } +} + +#[test] +fn hamming_dist_matrix_symmetry_and_diagonal() { + let (_d, m) = make_matrix(&[ + &[true, false, true], + &[true, true, false], + &[false, true, true], + ]); + let dm = m.hamming_dist_matrix(); + let n = m.n_cols(); + for i in 0..n { assert_eq!(dm[[i, i]], 0, "diagonal"); } + for i in 0..n { + for j in 0..n { + assert_eq!(dm[[i, j]], dm[[j, i]], "symmetry [{i},{j}]"); + } + } +} + +#[test] +fn hamming_dist_matrix_values_match_pairwise() { + let (_d, m) = make_matrix(&[ + &[true, false, true], + &[true, true, false], + &[false, true, true], + ]); + let dm = m.hamming_dist_matrix(); + for i in 0..m.n_cols() { + for j in 0..m.n_cols() { + assert_eq!(dm[[i, j]], m.col(i).hamming_dist(m.col(j)), "[{i},{j}]"); + } + } +} + +#[test] +fn partial_jaccard_consistent() { + let (_d, m) = make_matrix(&[ + &[true, false, true, true], + &[true, true, false, true], + &[false, true, true, false], + ]); + let (inter, union) = m.partial_jaccard_dist_matrix(); + let n = m.n_cols(); + for i in 0..n { + for j in i + 1..n { + let (ei, eu) = m.col(i).partial_jaccard_dist(m.col(j)); + assert_eq!(inter[[i, j]], ei, "inter [{i},{j}]"); + assert_eq!(union[[i, j]], eu, "union [{i},{j}]"); + assert_eq!(inter[[j, i]], ei, "inter symmetry"); + assert_eq!(union[[j, i]], eu, "union symmetry"); + } + } +} + +#[test] +fn partial_hamming_matches_hamming() { + let (_d, m) = make_matrix(&[ + &[true, false, true], + &[false, true, true], + &[true, true, false], + ]); + let partial = m.partial_hamming_dist_matrix(); + let full = m.hamming_dist_matrix(); + assert_eq!(partial, full); +} diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs index 8af7d6e..ced72c0 100644 --- a/src/obicompactvec/src/tests/intmatrix.rs +++ b/src/obicompactvec/src/tests/intmatrix.rs @@ -2,6 +2,22 @@ use tempfile::tempdir; use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; +fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let mut b = PersistentCompactIntMatrixBuilder::new(n, dir.path()).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { + cb.set(slot, v); + } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + (dir, m) +} + #[test] fn single_col_roundtrip() { let dir = tempdir().unwrap(); @@ -66,3 +82,111 @@ fn zero_cols_roundtrip() { assert_eq!(m.n_cols(), 0); assert_eq!(m.n(), 10); } + +// ── Distance matrix tests ───────────────────────────────────────────────────── + +#[test] +fn bray_dist_matrix_symmetry_and_diagonal() { + // col0=[1,0,1], col1=[1,1,0], col2=[0,1,1] + let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]); + let dm = m.bray_dist_matrix(); + let n = m.n_cols(); + for i in 0..n { assert_eq!(dm[[i, i]], 0.0, "diagonal"); } + for i in 0..n { + for j in 0..n { + assert!((dm[[i, j]] - dm[[j, i]]).abs() < 1e-12, "symmetry"); + } + } +} + +#[test] +fn bray_dist_matrix_values_match_pairwise() { + let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]); + let dm = m.bray_dist_matrix(); + for i in 0..m.n_cols() { + for j in 0..m.n_cols() { + let expected = m.col(i).bray_dist(m.col(j)); + assert!((dm[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]"); + } + } +} + +#[test] +fn jaccard_dist_matrix_values_match_pairwise() { + let (_d, m) = make_matrix(&[&[1, 0, 2], &[0, 1, 1], &[1, 1, 0]]); + let dm = m.jaccard_dist_matrix(); + for i in 0..m.n_cols() { + for j in 0..m.n_cols() { + let expected = m.col(i).jaccard_dist(m.col(j)); + assert!((dm[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]"); + } + } +} + +#[test] +fn partial_bray_dist_matrix_consistent() { + let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]); + let (sum_min, col_sums) = m.partial_bray_dist_matrix(); + let n = m.n_cols(); + + // symmetry of sum_min + for i in 0..n { + for j in 0..n { + assert_eq!(sum_min[[i, j]], sum_min[[j, i]]); + } + } + + // col_sums correct + for k in 0..n { + assert_eq!(col_sums[k], m.col(k).sum()); + } + + // reconstruct distance from partials and compare to direct method + for i in 0..n { + for j in i + 1..n { + let denom = col_sums[i] + col_sums[j]; + let dist = if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 }; + let expected = m.col(i).bray_dist(m.col(j)); + assert!((dist - expected).abs() < 1e-12, "[{i},{j}]"); + } + } +} + +#[test] +fn partial_euclidean_dist_matrix_consistent() { + let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]); + let sq = m.partial_euclidean_dist_matrix(); + let n = m.n_cols(); + for i in 0..n { + for j in i + 1..n { + let expected = m.col(i).partial_euclidean_dist(m.col(j)); + assert!((sq[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]"); + assert!((sq[[j, i]] - expected).abs() < 1e-12, "symmetry [{j},{i}]"); + } + } +} + +#[test] +fn partial_threshold_jaccard_consistent() { + let (_d, m) = make_matrix(&[&[3, 0, 2], &[1, 2, 0], &[0, 3, 1]]); + let threshold = 2u32; + let (inter, union) = m.partial_threshold_jaccard_dist_matrix(threshold); + let n = m.n_cols(); + for i in 0..n { + for j in i + 1..n { + let (ei, eu) = m.col(i).partial_threshold_jaccard_dist(m.col(j), threshold); + assert_eq!(inter[[i, j]], ei); + assert_eq!(union[[i, j]], eu); + assert_eq!(inter[[j, i]], ei, "symmetry inter"); + assert_eq!(union[[j, i]], eu, "symmetry union"); + } + } +} + +#[test] +fn single_col_distance_matrix_is_zero() { + let (_d, m) = make_matrix(&[&[1, 2, 3]]); + let dm = m.bray_dist_matrix(); + assert_eq!(dm.shape(), &[1, 1]); + assert_eq!(dm[[0, 0]], 0.0); +}