refactor: extract matrix helpers and improve bit iteration ergonomics

Refactor parallel matrix construction by extracting reusable `pairwise_matrix` and `pairwise2_matrix` helpers, and consolidate binary record deserialization into dedicated parsing functions. Add `set` and `iter` methods to `BitSliceMut` and `MemoryBitVec` for ergonomic bit manipulation and iteration. Standardize JSON field extraction via `meta::field`, expose `MemoryBitIter`, and improve test reliability by automatically cleaning up temporary directories.
This commit is contained in:
Eric Coissac
2026-06-16 23:36:25 +02:00
parent cde6457eea
commit d1717688d2
16 changed files with 136 additions and 183 deletions
+28 -101
View File
@@ -8,9 +8,10 @@ use memmap2::Mmap;
use ndarray::{Array1, Array2};
use rayon::prelude::*;
use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
use crate::builder::PersistentCompactIntVecBuilder;
use crate::memoryintvec::MemoryIntVec;
use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
use crate::meta::MatrixMeta;
use crate::reader::PersistentCompactIntVec;
@@ -65,49 +66,35 @@ impl ColumnarCompactIntMatrix {
}
/// Evaluates `partial_bray_dist` between columns `i` and `j` for the column pairs visited by
/// the pairwise helper and returns the results as an `Array2<u64>`.
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise_u64(...)` looks like the
// removed call and `pairwise_matrix(...)` its replacement; only one of them belongs in the body.
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
}
/// Evaluates `partial_euclidean_dist` between columns `i` and `j` for the column pairs visited
/// by the pairwise helper and returns the results as an `Array2<f64>`.
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(...)` looks like the
// removed call and `pairwise_matrix(...)` its replacement; only one of them belongs in the body.
self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
}
/// Computes, for column pairs `(i, j)`, the two values returned by
/// `partial_threshold_jaccard_dist` (bound here as `inter` and `union`) and returns them as
/// two separate `Array2<u64>` matrices: `(inter_m, union_m)`.
///
/// `threshold` is forwarded unchanged to `partial_threshold_jaccard_dist`.
pub(crate) fn partial_threshold_jaccard_dist_matrix(
&self, threshold: u32,
) -> (Array2<u64>, Array2<u64>) {
// NOTE(review): the +/- diff markers are missing here — everything down to `(inter_m, union_m)`
// looks like the removed inline implementation, and the trailing `pairwise2_matrix(...)` call
// its replacement; only one of the two belongs in the body.
let n = self.n_cols();
// Index pairs to evaluate; each pair is processed on the rayon parallel iterator below.
let pairs = upper_pairs(n);
let results: Vec<(usize, usize, u64, u64)> = pairs
.into_par_iter()
.map(|(i, j)| {
let (inter, union) =
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
(i, j, inter, union)
})
.collect();
// Both matrices start zero-filled; cells never written below stay zero.
let mut inter_m = Array2::zeros((n, n));
let mut union_m = Array2::zeros((n, n));
for (i, j, inter, union) in results {
// Mirror each value into [i, j] and [j, i] so both matrices are symmetric.
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
union_m[[i, j]] = union; union_m[[j, i]] = union;
}
(inter_m, union_m)
pairwise2_matrix(self.n_cols(), |i, j| {
self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)
})
}
/// Evaluates `partial_relfreq_bray_dist` between columns `i` and `j`, passing `col_sums[i]` and
/// `col_sums[j]` converted to `f64`, and returns the results as an `Array2<f64>`.
///
/// `col_sums` is indexed by column number, so it must hold at least one entry per column.
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(|i, j| {` looks like the
// removed opening line and `pairwise_matrix(self.n_cols(), |i, j| {` its replacement.
self.pairwise(|i, j| {
pairwise_matrix(self.n_cols(), |i, j| {
self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
})
}
/// Evaluates `partial_relfreq_euclidean_dist` between columns `i` and `j`, passing `col_sums[i]`
/// and `col_sums[j]` converted to `f64`, and returns the results as an `Array2<f64>`.
///
/// `col_sums` is indexed by column number, so it must hold at least one entry per column.
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(|i, j| {` looks like the
// removed opening line and `pairwise_matrix(self.n_cols(), |i, j| {` its replacement.
self.pairwise(|i, j| {
pairwise_matrix(self.n_cols(), |i, j| {
self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
})
}
/// Evaluates `partial_hellinger_euclidean_dist` between columns `i` and `j`, passing
/// `col_sums[i]` and `col_sums[j]` converted to `f64`, and returns the results as an `Array2<f64>`.
///
/// `col_sums` is indexed by column number, so it must hold at least one entry per column.
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(|i, j| {` looks like the
// removed opening line and `pairwise_matrix(self.n_cols(), |i, j| {` its replacement.
self.pairwise(|i, j| {
pairwise_matrix(self.n_cols(), |i, j| {
self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
})
}
@@ -121,19 +108,6 @@ impl ColumnarCompactIntMatrix {
meta.save(dir)
}
/// Evaluates `f(i, j)` in parallel for every column pair produced by `upper_pairs` and
/// assembles the results into a symmetric `n_cols × n_cols` matrix.
///
/// Each value is written to both `[i, j]` and `[j, i]`; cells that no pair touches keep the
/// default value supplied by `fill_symmetric`.
fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
    let n_cols = self.n_cols();
    let cells: Vec<(usize, usize, f64)> = upper_pairs(n_cols)
        .into_par_iter()
        .map(|(i, j)| {
            let value = f(i, j);
            (i, j, value)
        })
        .collect();
    // The same value goes on both sides of the diagonal.
    let mirrored = cells.into_iter().map(|(i, j, value)| (i, j, value, value));
    fill_symmetric(n_cols, mirrored)
}
/// `u64` counterpart of `pairwise`: evaluates `f(i, j)` in parallel for every column pair
/// produced by `upper_pairs` and assembles a symmetric `n_cols × n_cols` matrix.
///
/// Each value is written to both `[i, j]` and `[j, i]`; cells that no pair touches keep the
/// default value supplied by `fill_symmetric`.
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
    let n_cols = self.n_cols();
    let cells: Vec<(usize, usize, u64)> = upper_pairs(n_cols)
        .into_par_iter()
        .map(|(i, j)| {
            let value = f(i, j);
            (i, j, value)
        })
        .collect();
    // The same value goes on both sides of the diagonal.
    let mirrored = cells.into_iter().map(|(i, j, value)| (i, j, value, value));
    fill_symmetric(n_cols, mirrored)
}
}
// ── PackedCompactIntMatrix ────────────────────────────────────────────────────
@@ -185,10 +159,7 @@ impl PackedCompactIntMatrix {
let mut index = Vec::with_capacity(n_idx);
for i in 0..n_idx {
let ioff = index_offset + i * INDEX_ENTRY_SIZE;
let slot = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap()) as usize;
let pos = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize;
index.push((slot, pos));
index.push(parse_index_entry(&mmap, index_offset, i));
}
columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
}
@@ -196,30 +167,25 @@ impl PackedCompactIntMatrix {
Ok(Self { mmap, n_rows, n_cols, columns })
}
/// Collects the overflow records of one column into a `slot → value` map.
///
/// Reads `ci.n_overflow` entries through `parse_overflow_entry`, handing it the memory map,
/// `ci.data_offset`, and the entry number. If two entries share a slot, the later one wins.
fn col_overflow_map(&self, ci: &ColInfo) -> HashMap<usize, u32> {
    // Pre-size the map: the number of entries is known up front.
    let mut by_slot = HashMap::with_capacity(ci.n_overflow);
    by_slot.extend(
        (0..ci.n_overflow).map(|entry| parse_overflow_entry(&self.mmap, ci.data_offset, entry)),
    );
    by_slot
}
/// Builds a `PersistentCompactIntVecBuilder` for column `c` at `path` from the column's raw
/// primary bytes and its overflow map.
///
/// Panics if `c` is not a valid index into `self.columns`; the result of `from_raw_primary` is
/// returned as-is.
pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
let ci = &self.columns[c];
// Primary section: `n_rows` bytes of the memory map starting at `primary_start` (borrowed, not copied).
let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
// NOTE(review): the +/- diff markers are missing here — the inline overflow loop and the first
// `from_raw_primary(...)` call look like removed lines; the second call, which delegates to
// `self.col_overflow_map(ci)`, looks like their replacement.
let mut overflow = HashMap::with_capacity(ci.n_overflow);
for i in 0..ci.n_overflow {
// Entry layout as read here: little-endian u64 slot at bytes 0..8, little-endian u32 value at 8..12.
let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
overflow.insert(slot, value);
}
PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path)
PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path)
}
/// Builds an in-memory `MemoryIntVec` for column `c` from a copy of the column's primary bytes
/// and its overflow map.
///
/// Panics if `c` is not a valid index into `self.columns`.
pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
let ci = &self.columns[c];
// Primary section: `n_rows` bytes of the memory map starting at `primary_start`, copied into a `Vec`.
let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
// NOTE(review): the +/- diff markers are missing here — the inline overflow loop and the first
// `from_primary_and_overflow(...)` call look like removed lines; the second call, which
// delegates to `self.col_overflow_map(ci)`, looks like their replacement.
let mut overflow = HashMap::with_capacity(ci.n_overflow);
for i in 0..ci.n_overflow {
// Entry layout as read here: little-endian u64 slot at bytes 0..8, little-endian u32 value at 8..12.
let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
overflow.insert(slot, value);
}
MemoryIntVec::from_primary_and_overflow(primary, overflow)
MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci))
}
#[inline]
@@ -327,55 +293,28 @@ impl PackedCompactIntMatrix {
// ── Matrix methods ────────────────────────────────────────────────────────
/// Evaluates `f(i, j)` in parallel for every column pair produced by `upper_pairs` and
/// assembles the results into a symmetric `n_cols × n_cols` matrix.
///
/// The computed value is stored at `[i, j]` and a clone of it at `[j, i]`; cells that no pair
/// touches keep `T::default()` as supplied by `fill_symmetric`.
fn pairwise<T>(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
where
    T: Clone + Default + Send,
{
    let n_cols = self.n_cols;
    let cells: Vec<(usize, usize, T)> = upper_pairs(n_cols)
        .into_par_iter()
        .map(|(i, j)| {
            let value = f(i, j);
            (i, j, value)
        })
        .collect();
    let mirrored = cells.into_iter().map(|(i, j, value)| {
        // Clone first so the original can be moved into the upper cell.
        let mirror = value.clone();
        (i, j, value, mirror)
    });
    fill_symmetric(n_cols, mirrored)
}
/// `u64`-specific variant of `pairwise`: evaluates `f(i, j)` in parallel for every column pair
/// produced by `upper_pairs` and assembles a symmetric `n_cols × n_cols` matrix.
///
/// Each value is written to both `[i, j]` and `[j, i]`; cells that no pair touches keep the
/// default value supplied by `fill_symmetric`.
fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
    let n_cols = self.n_cols;
    let cells: Vec<(usize, usize, u64)> = upper_pairs(n_cols)
        .into_par_iter()
        .map(|(i, j)| {
            let value = f(i, j);
            (i, j, value)
        })
        .collect();
    // The same value goes on both sides of the diagonal.
    let mirrored = cells.into_iter().map(|(i, j, value)| (i, j, value, value));
    fill_symmetric(n_cols, mirrored)
}
/// Evaluates `pair_partial_bray(i, j)` for the column pairs visited by the pairwise helper and
/// returns the results as an `Array2<u64>`.
pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise_u64(...)` looks like the
// removed call and `pairwise_matrix(...)` its replacement; only one of them belongs in the body.
self.pairwise_u64(|i, j| self.pair_partial_bray(i, j))
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
}
/// Evaluates `pair_partial_euclidean(i, j)` for the column pairs visited by the pairwise helper
/// and returns the results as an `Array2<f64>`.
pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(...)` looks like the
// removed call and `pairwise_matrix(...)` its replacement; only one of them belongs in the body.
self.pairwise(|i, j| self.pair_partial_euclidean(i, j))
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
}
/// Computes, for column pairs `(i, j)`, the two values returned by
/// `pair_partial_threshold_jaccard` (bound here as `inter` and `union`) and returns them as two
/// separate `Array2<u64>` matrices: `(inter_m, union_m)`.
///
/// `t` is forwarded unchanged to `pair_partial_threshold_jaccard`.
pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
// NOTE(review): the +/- diff markers are missing here — everything down to `(inter_m, union_m)`
// looks like the removed inline implementation, and the trailing `pairwise2_matrix(...)` call
// its replacement; only one of the two belongs in the body.
let n = self.n_cols;
// Pairs from `upper_pairs` are evaluated on the rayon parallel iterator.
let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
.into_par_iter()
.map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) })
.collect();
// Both matrices start zero-filled; cells never written below stay zero.
let mut inter_m = Array2::zeros((n, n));
let mut union_m = Array2::zeros((n, n));
for (i, j, inter, union) in results {
// Mirror each value into [i, j] and [j, i] so both matrices are symmetric.
inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
union_m[[i, j]] = union; union_m[[j, i]] = union;
}
(inter_m, union_m)
pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
}
/// Evaluates `pair_partial_relfreq_bray(i, j, ..)`, passing `col_sums[i]` and `col_sums[j]`
/// converted to `f64`, and returns the results as an `Array2<f64>`.
///
/// `col_sums` is indexed by column number, so it must hold at least one entry per column.
pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(...)` looks like the
// removed call and `pairwise_matrix(...)` its replacement; only one of them belongs in the body.
self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
}
/// Evaluates `pair_partial_relfreq_euclidean(i, j, ..)`, passing `col_sums[i]` and `col_sums[j]`
/// converted to `f64`, and returns the results as an `Array2<f64>`.
///
/// `col_sums` is indexed by column number, so it must hold at least one entry per column.
pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(...)` looks like the
// removed call and `pairwise_matrix(...)` its replacement; only one of them belongs in the body.
self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
}
/// Evaluates `pair_partial_hellinger(i, j, ..)`, passing `col_sums[i]` and `col_sums[j]`
/// converted to `f64`, and returns the results as an `Array2<f64>`.
///
/// `col_sums` is indexed by column number, so it must hold at least one entry per column.
pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
// NOTE(review): the +/- diff markers are missing here — `self.pairwise(...)` looks like the
// removed call and `pairwise_matrix(...)` its replacement; only one of them belongs in the body.
self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
}
}
@@ -570,15 +509,3 @@ impl PersistentCompactIntMatrixBuilder {
}
}
// ── Helpers ───────────────────────────────────────────────────────────────────
/// Lists every index pair `(i, j)` with `i < j < n`, ordered by `i` and then by `j`.
///
/// Returns an empty vector when `n` is 0 or 1.
fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
    let mut pairs = Vec::new();
    for i in 0..n {
        for j in (i + 1)..n {
            pairs.push((i, j));
        }
    }
    pairs
}
/// Builds an `n × n` matrix initialised to `T::default()` and fills it from `vals`.
///
/// Each item `(i, j, vij, vji)` stores `vij` at `[i, j]` and `vji` at `[j, i]`, in iteration
/// order, so a later item overwrites an earlier one that targets the same cell. Cells that no
/// item touches keep the default value.
fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
where
    T: Clone + Default,
{
    let mut matrix = Array2::from_elem((n, n), T::default());
    vals.for_each(|(row, col, upper, lower)| {
        matrix[[row, col]] = upper;
        matrix[[col, row]] = lower;
    });
    matrix
}