refactor: extract matrix helpers and improve bit iteration ergonomics
Refactor parallel matrix construction by extracting reusable `pairwise_matrix` and `pairwise2_matrix` helpers, and consolidate binary record deserialization into dedicated parsing functions. Add `set` and `iter` methods to `BitSliceMut` and `MemoryBitVec` for ergonomic bit manipulation and iteration. Standardize JSON field extraction via `meta::field`, expose `MemoryBitIter`, and improve test reliability by automatically cleaning up temporary directories.
This commit is contained in:
@@ -8,9 +8,10 @@ use memmap2::Mmap;
 use ndarray::{Array1, Array2};
 use rayon::prelude::*;
 
+use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
 use crate::builder::PersistentCompactIntVecBuilder;
 use crate::memoryintvec::MemoryIntVec;
-use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
+use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, parse_index_entry, parse_overflow_entry};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
 
@@ -65,49 +66,35 @@ impl ColumnarCompactIntMatrix {
     }
 
     pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
     }
 
     pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
-        self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
     }
 
     pub(crate) fn partial_threshold_jaccard_dist_matrix(
         &self, threshold: u32,
     ) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols();
-        let pairs = upper_pairs(n);
-        let results: Vec<(usize, usize, u64, u64)> = pairs
-            .into_par_iter()
-            .map(|(i, j)| {
-                let (inter, union) =
-                    self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
-                (i, j, inter, union)
-            })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols(), |i, j| {
+            self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)
+        })
     }
 
     pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
 
     pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
 
     pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
             self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
         })
     }
@@ -121,19 +108,6 @@ impl ColumnarCompactIntMatrix {
         meta.save(dir)
     }
 
-    fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, f64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
 }
 
 // ── PackedCompactIntMatrix ────────────────────────────────────────────────────
@@ -185,10 +159,7 @@ impl PackedCompactIntMatrix {
 
             let mut index = Vec::with_capacity(n_idx);
             for i in 0..n_idx {
-                let ioff = index_offset + i * INDEX_ENTRY_SIZE;
-                let slot = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap()) as usize;
-                let pos = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize;
-                index.push((slot, pos));
+                index.push(parse_index_entry(&mmap, index_offset, i));
             }
             columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
         }
@@ -196,30 +167,25 @@ impl PackedCompactIntMatrix {
         Ok(Self { mmap, n_rows, n_cols, columns })
     }
 
+    fn col_overflow_map(&self, ci: &ColInfo) -> HashMap<usize, u32> {
+        let mut overflow = HashMap::with_capacity(ci.n_overflow);
+        for i in 0..ci.n_overflow {
+            let (slot, value) = parse_overflow_entry(&self.mmap, ci.data_offset, i);
+            overflow.insert(slot, value);
+        }
+        overflow
+    }
+
     pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
         let ci = &self.columns[c];
         let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
-        let mut overflow = HashMap::with_capacity(ci.n_overflow);
-        for i in 0..ci.n_overflow {
-            let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
-            let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
-            let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
-            overflow.insert(slot, value);
-        }
-        PersistentCompactIntVecBuilder::from_raw_primary(primary, overflow, path)
+        PersistentCompactIntVecBuilder::from_raw_primary(primary, self.col_overflow_map(ci), path)
     }
 
     pub(crate) fn col_as_memory(&self, c: usize) -> MemoryIntVec {
         let ci = &self.columns[c];
         let primary = self.mmap[ci.primary_start..ci.primary_start + self.n_rows].to_vec();
-        let mut overflow = HashMap::with_capacity(ci.n_overflow);
-        for i in 0..ci.n_overflow {
-            let off = ci.data_offset + i * OVERFLOW_ENTRY_SIZE;
-            let slot = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
-            let value = u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap());
-            overflow.insert(slot, value);
-        }
-        MemoryIntVec::from_primary_and_overflow(primary, overflow)
+        MemoryIntVec::from_primary_and_overflow(primary, self.col_overflow_map(ci))
     }
 
     #[inline]
@@ -327,55 +293,28 @@ impl PackedCompactIntMatrix {
 
     // ── Matrix methods ────────────────────────────────────────────────────────
 
-    fn pairwise<T>(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
-    where T: Clone + Default + Send {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, T)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| { let w = v.clone(); (i, j, v, w) }))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
-
     pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.pair_partial_bray(i, j))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
     }
 
-
     pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_euclidean(i, j))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
     }
 
     pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
     }
 
     pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
 
     pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
 
     pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
     }
 
 }
@@ -570,15 +509,3 @@ impl PersistentCompactIntMatrixBuilder {
     }
 }
 
-// ── Helpers ───────────────────────────────────────────────────────────────────
-
-fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
-    (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
-}
-
-fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
-where T: Clone + Default {
-    let mut m = Array2::from_elem((n, n), T::default());
-    for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
-    m
-}
Reference in New Issue
Block a user