feat: add parallel distance matrix computation for bit and int matrices

Introduce parallel distance matrix generation using `ndarray` and `rayon` for both `BitMatrix` and `IntMatrix`. Adds full and additive-partial variants for Jaccard, Hamming, Bray-Curtis, Euclidean, and Hellinger metrics. Includes comprehensive unit tests verifying matrix symmetry, zero diagonals, and numerical correctness against pairwise calculations.
This commit is contained in:
Eric Coissac
2026-05-15 17:18:02 +08:00
parent 1881e98bad
commit 8bee9f3017
6 changed files with 488 additions and 0 deletions
+42
View File
@@ -1460,6 +1460,16 @@ dependencies = [
"regex-automata",
]
[[package]]
name = "matrixmultiply"
version = "0.3.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08"
dependencies = [
"autocfg",
"rawpointer",
]
[[package]]
name = "mem_dbg"
version = "0.3.4"
@@ -1546,6 +1556,21 @@ version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "729eb334247daa1803e0a094d0a5c55711b85571179f5ec6e53eccfdf7008958"
[[package]]
name = "ndarray"
version = "0.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841"
dependencies = [
"matrixmultiply",
"num-complex",
"num-integer",
"num-traits",
"portable-atomic",
"portable-atomic-util",
"rawpointer",
]
[[package]]
name = "niffler"
version = "2.7.0"
@@ -1646,6 +1671,15 @@ dependencies = [
"itoa",
]
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
@@ -1660,6 +1694,8 @@ name = "obicompactvec"
version = "0.1.0"
dependencies = [
"memmap2",
"ndarray",
"rayon",
"tempfile",
]
@@ -2226,6 +2262,12 @@ dependencies = [
"getrandom 0.3.4",
]
[[package]]
name = "rawpointer"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3"
[[package]]
name = "rayon"
version = "1.12.0"
+2
View File
@@ -5,6 +5,8 @@ edition = "2024"
[dependencies]
memmap2 = "0.9"
ndarray = "0.16"
rayon = "1"
[dev-dependencies]
tempfile = "3"
+80
View File
@@ -1,5 +1,8 @@
use std::{fs, io, path::{Path, PathBuf}};
use ndarray::{Array2};
use rayon::prelude::*;
use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
use crate::meta::MatrixMeta;
@@ -28,8 +31,85 @@ impl PersistentBitMatrix {
pub fn row(&self, slot: usize) -> Box<[bool]> {
self.cols.iter().map(|c| c.get(slot)).collect()
}
// ── Distance matrices ─────────────────────────────────────────────────────
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
self.pairwise_f64(|i, j| self.col(i).jaccard_dist(self.col(j)))
}
pub fn hamming_dist_matrix(&self) -> Array2<u64> {
self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
}
// ── Partial matrices (additively decomposable across layers) ──────────────
/// Returns `(inter[n×n], union[n×n])`.
/// Reduce across layers by element-wise addition before computing `1 - inter/union`.
pub fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
let n = self.n_cols();
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
.into_par_iter()
.map(|(i, j)| {
let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j));
(i, j, inter, union)
})
.collect();
let mut inter_m = Array2::zeros((n, n));
let mut union_m = Array2::zeros((n, n));
for (i, j, inter, union) in results {
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
union_m[[i, j]] = union; union_m[[j, i]] = union;
}
(inter_m, union_m)
}
/// Returns `hamming[n×n]` (number of differing bits per pair).
/// Additive across layers — reduce by element-wise addition.
pub fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
}
// ── Private helpers ───────────────────────────────────────────────────────
fn pairwise_f64(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
let n = self.n_cols();
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
.into_par_iter()
.map(|(i, j)| (i, j, f(i, j)))
.collect();
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
}
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
let n = self.n_cols();
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
.into_par_iter()
.map(|(i, j)| (i, j, f(i, j)))
.collect();
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
}
}
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
(0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
}
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
where
T: Clone + Default,
{
let mut m = Array2::from_elem((n, n), T::default());
for (i, j, vij, vji) in vals {
m[[i, j]] = vij;
m[[j, i]] = vji;
}
m
}
// ── Builder ───────────────────────────────────────────────────────────────────
pub struct PersistentBitMatrixBuilder {
dir: PathBuf,
n: usize,
+125
View File
@@ -1,5 +1,8 @@
use std::{fs, io, path::{Path, PathBuf}};
use ndarray::{Array1, Array2};
use rayon::prelude::*;
use crate::builder::PersistentCompactIntVecBuilder;
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
@@ -29,8 +32,130 @@ impl PersistentCompactIntMatrix {
pub fn row(&self, slot: usize) -> Box<[u32]> {
self.cols.iter().map(|c| c.get(slot)).collect()
}
// ── Distance matrices ─────────────────────────────────────────────────────
pub fn bray_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).bray_dist(self.col(j)))
}
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).relfreq_bray_dist(self.col(j)))
}
pub fn euclidean_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).euclidean_dist(self.col(j)))
}
pub fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).relfreq_euclidean_dist(self.col(j)))
}
pub fn hellinger_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).hellinger_dist(self.col(j)))
}
pub fn jaccard_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).jaccard_dist(self.col(j)))
}
pub fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).threshold_jaccard_dist(self.col(j), threshold))
}
// ── Partial matrices (additively decomposable across layers) ──────────────
/// Returns `(sum_min[n×n], col_sums[n])`.
/// `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]).
/// `col_sums[k]` = Σ_slot col_k[slot].
/// Reduce across layers by element-wise addition before computing the final distance.
pub fn partial_bray_dist_matrix(&self) -> (Array2<u64>, Array1<u64>) {
let n = self.n_cols();
let col_sums: Vec<u64> = (0..n)
.into_par_iter()
.map(|i| self.col(i).sum())
.collect();
let sum_min = self.pairwise_u64(|i, j| {
self.col(i).partial_bray_dist(self.col(j)).0
});
(sum_min, Array1::from_vec(col_sums))
}
/// Returns sum of squared differences `[n×n]`.
/// Reduce across layers by element-wise addition, then take `sqrt` for the final distance.
pub fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
}
/// Returns `(inter[n×n], union[n×n])` for threshold-Jaccard.
/// Reduce across layers by element-wise addition before computing `1 - inter/union`.
pub fn partial_threshold_jaccard_dist_matrix(
&self,
threshold: u32,
) -> (Array2<u64>, Array2<u64>) {
let n = self.n_cols();
let pairs = upper_pairs(n);
let results: Vec<(usize, usize, u64, u64)> = pairs
.into_par_iter()
.map(|(i, j)| {
let (inter, union) =
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
(i, j, inter, union)
})
.collect();
let mut inter_m = Array2::zeros((n, n));
let mut union_m = Array2::zeros((n, n));
for (i, j, inter, union) in results {
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
union_m[[i, j]] = union; union_m[[j, i]] = union;
}
(inter_m, union_m)
}
// ── Private helpers ───────────────────────────────────────────────────────
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
let n = self.n_cols();
let results: Vec<(usize, usize, f64)> = upper_pairs(n)
.into_par_iter()
.map(|(i, j)| (i, j, f(i, j)))
.collect();
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
}
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
let n = self.n_cols();
let results: Vec<(usize, usize, u64)> = upper_pairs(n)
.into_par_iter()
.map(|(i, j)| (i, j, f(i, j)))
.collect();
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
}
}
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
(0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
}
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
where
T: Clone + Default,
{
let mut m = Array2::from_elem((n, n), T::default());
for (i, j, vij, vji) in vals {
m[[i, j]] = vij;
m[[j, i]] = vji;
}
m
}
// ── Builder ───────────────────────────────────────────────────────────────────
pub struct PersistentCompactIntMatrixBuilder {
dir: PathBuf,
n: usize,
+115
View File
@@ -2,6 +2,22 @@ use tempfile::tempdir;
use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
let n = cols.first().map_or(0, |c| c.len());
let dir = tempdir().unwrap();
let mut b = PersistentBitMatrixBuilder::new(n, dir.path()).unwrap();
for &col in cols {
let mut cb = b.add_col().unwrap();
for (slot, &v) in col.iter().enumerate() {
cb.set(slot, v);
}
cb.close().unwrap();
}
b.close().unwrap();
let m = PersistentBitMatrix::open(dir.path()).unwrap();
(dir, m)
}
#[test]
fn single_col_roundtrip() {
let dir = tempdir().unwrap();
@@ -67,3 +83,102 @@ fn zero_cols_roundtrip() {
assert_eq!(m.n_cols(), 0);
assert_eq!(m.n(), 8);
}
// ── Distance matrix tests ─────────────────────────────────────────────────────
#[test]
fn jaccard_dist_matrix_symmetry_and_diagonal() {
let (_d, m) = make_matrix(&[
&[true, false, true],
&[true, true, false],
&[false, true, true],
]);
let dm = m.jaccard_dist_matrix();
let n = m.n_cols();
for i in 0..n { assert_eq!(dm[[i, i]], 0.0, "diagonal"); }
for i in 0..n {
for j in 0..n {
assert!((dm[[i, j]] - dm[[j, i]]).abs() < 1e-12, "symmetry [{i},{j}]");
}
}
}
#[test]
fn jaccard_dist_matrix_values_match_pairwise() {
let (_d, m) = make_matrix(&[
&[true, false, true],
&[true, true, false],
&[false, true, true],
]);
let dm = m.jaccard_dist_matrix();
for i in 0..m.n_cols() {
for j in 0..m.n_cols() {
let expected = m.col(i).jaccard_dist(m.col(j));
assert!((dm[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]");
}
}
}
#[test]
fn hamming_dist_matrix_symmetry_and_diagonal() {
let (_d, m) = make_matrix(&[
&[true, false, true],
&[true, true, false],
&[false, true, true],
]);
let dm = m.hamming_dist_matrix();
let n = m.n_cols();
for i in 0..n { assert_eq!(dm[[i, i]], 0, "diagonal"); }
for i in 0..n {
for j in 0..n {
assert_eq!(dm[[i, j]], dm[[j, i]], "symmetry [{i},{j}]");
}
}
}
#[test]
fn hamming_dist_matrix_values_match_pairwise() {
let (_d, m) = make_matrix(&[
&[true, false, true],
&[true, true, false],
&[false, true, true],
]);
let dm = m.hamming_dist_matrix();
for i in 0..m.n_cols() {
for j in 0..m.n_cols() {
assert_eq!(dm[[i, j]], m.col(i).hamming_dist(m.col(j)), "[{i},{j}]");
}
}
}
#[test]
fn partial_jaccard_consistent() {
let (_d, m) = make_matrix(&[
&[true, false, true, true],
&[true, true, false, true],
&[false, true, true, false],
]);
let (inter, union) = m.partial_jaccard_dist_matrix();
let n = m.n_cols();
for i in 0..n {
for j in i + 1..n {
let (ei, eu) = m.col(i).partial_jaccard_dist(m.col(j));
assert_eq!(inter[[i, j]], ei, "inter [{i},{j}]");
assert_eq!(union[[i, j]], eu, "union [{i},{j}]");
assert_eq!(inter[[j, i]], ei, "inter symmetry");
assert_eq!(union[[j, i]], eu, "union symmetry");
}
}
}
#[test]
fn partial_hamming_matches_hamming() {
let (_d, m) = make_matrix(&[
&[true, false, true],
&[false, true, true],
&[true, true, false],
]);
let partial = m.partial_hamming_dist_matrix();
let full = m.hamming_dist_matrix();
assert_eq!(partial, full);
}
+124
View File
@@ -2,6 +2,22 @@ use tempfile::tempdir;
use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
let n = cols.first().map_or(0, |c| c.len());
let dir = tempdir().unwrap();
let mut b = PersistentCompactIntMatrixBuilder::new(n, dir.path()).unwrap();
for &col in cols {
let mut cb = b.add_col().unwrap();
for (slot, &v) in col.iter().enumerate() {
cb.set(slot, v);
}
cb.close().unwrap();
}
b.close().unwrap();
let m = PersistentCompactIntMatrix::open(dir.path()).unwrap();
(dir, m)
}
#[test]
fn single_col_roundtrip() {
let dir = tempdir().unwrap();
@@ -66,3 +82,111 @@ fn zero_cols_roundtrip() {
assert_eq!(m.n_cols(), 0);
assert_eq!(m.n(), 10);
}
// ── Distance matrix tests ─────────────────────────────────────────────────────
#[test]
fn bray_dist_matrix_symmetry_and_diagonal() {
// col0=[1,0,1], col1=[1,1,0], col2=[0,1,1]
let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]);
let dm = m.bray_dist_matrix();
let n = m.n_cols();
for i in 0..n { assert_eq!(dm[[i, i]], 0.0, "diagonal"); }
for i in 0..n {
for j in 0..n {
assert!((dm[[i, j]] - dm[[j, i]]).abs() < 1e-12, "symmetry");
}
}
}
#[test]
fn bray_dist_matrix_values_match_pairwise() {
let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]);
let dm = m.bray_dist_matrix();
for i in 0..m.n_cols() {
for j in 0..m.n_cols() {
let expected = m.col(i).bray_dist(m.col(j));
assert!((dm[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]");
}
}
}
#[test]
fn jaccard_dist_matrix_values_match_pairwise() {
let (_d, m) = make_matrix(&[&[1, 0, 2], &[0, 1, 1], &[1, 1, 0]]);
let dm = m.jaccard_dist_matrix();
for i in 0..m.n_cols() {
for j in 0..m.n_cols() {
let expected = m.col(i).jaccard_dist(m.col(j));
assert!((dm[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]");
}
}
}
#[test]
fn partial_bray_dist_matrix_consistent() {
let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]);
let (sum_min, col_sums) = m.partial_bray_dist_matrix();
let n = m.n_cols();
// symmetry of sum_min
for i in 0..n {
for j in 0..n {
assert_eq!(sum_min[[i, j]], sum_min[[j, i]]);
}
}
// col_sums correct
for k in 0..n {
assert_eq!(col_sums[k], m.col(k).sum());
}
// reconstruct distance from partials and compare to direct method
for i in 0..n {
for j in i + 1..n {
let denom = col_sums[i] + col_sums[j];
let dist = if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 };
let expected = m.col(i).bray_dist(m.col(j));
assert!((dist - expected).abs() < 1e-12, "[{i},{j}]");
}
}
}
#[test]
fn partial_euclidean_dist_matrix_consistent() {
let (_d, m) = make_matrix(&[&[3, 0], &[0, 4], &[1, 1]]);
let sq = m.partial_euclidean_dist_matrix();
let n = m.n_cols();
for i in 0..n {
for j in i + 1..n {
let expected = m.col(i).partial_euclidean_dist(m.col(j));
assert!((sq[[i, j]] - expected).abs() < 1e-12, "[{i},{j}]");
assert!((sq[[j, i]] - expected).abs() < 1e-12, "symmetry [{j},{i}]");
}
}
}
#[test]
fn partial_threshold_jaccard_consistent() {
let (_d, m) = make_matrix(&[&[3, 0, 2], &[1, 2, 0], &[0, 3, 1]]);
let threshold = 2u32;
let (inter, union) = m.partial_threshold_jaccard_dist_matrix(threshold);
let n = m.n_cols();
for i in 0..n {
for j in i + 1..n {
let (ei, eu) = m.col(i).partial_threshold_jaccard_dist(m.col(j), threshold);
assert_eq!(inter[[i, j]], ei);
assert_eq!(union[[i, j]], eu);
assert_eq!(inter[[j, i]], ei, "symmetry inter");
assert_eq!(union[[j, i]], eu, "symmetry union");
}
}
}
#[test]
fn single_col_distance_matrix_is_zero() {
let (_d, m) = make_matrix(&[&[1, 2, 3]]);
let dm = m.bray_dist_matrix();
assert_eq!(dm.shape(), &[1, 1]);
assert_eq!(dm[[0, 0]], 0.0);
}