feat: introduce trait-based distance aggregation and layered store
Introduces ColumnWeights, CountPartials, and BitPartials traits to compute and finalize partial distance matrices. Implements these traits for PersistentBitMatrix, PersistentCompactIntMatrix, and a new LayeredStore<S> wrapper that aggregates metrics across layers via parallel reduction. Adds ndarray for numerical aggregation and updates architecture documentation to reflect the trait-driven design and pending refactoring roadmap.
This commit is contained in:
Generated
+1
@@ -1788,6 +1788,7 @@ dependencies = [
|
||||
"cacheline-ef",
|
||||
"epserde 0.8.0",
|
||||
"memmap2",
|
||||
"ndarray",
|
||||
"obicompactvec",
|
||||
"obikseq",
|
||||
"obiskio",
|
||||
|
||||
@@ -117,6 +117,23 @@ where
|
||||
m
|
||||
}
|
||||
|
||||
// ── Trait impls ───────────────────────────────────────────────────────────────
|
||||
|
||||
use crate::traits::{BitPartials, ColumnWeights};
|
||||
|
||||
impl ColumnWeights for PersistentBitMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.count_ones() }
|
||||
}
|
||||
|
||||
impl BitPartials for PersistentBitMatrix {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
self.partial_jaccard_dist_matrix()
|
||||
}
|
||||
fn partial_hamming(&self) -> Array2<u64> {
|
||||
self.partial_hamming_dist_matrix()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentBitMatrixBuilder {
|
||||
|
||||
@@ -203,6 +203,35 @@ where
|
||||
m
|
||||
}
|
||||
|
||||
// ── Trait impls ───────────────────────────────────────────────────────────────
|
||||
|
||||
use crate::traits::{ColumnWeights, CountPartials};
|
||||
|
||||
impl ColumnWeights for PersistentCompactIntMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.sum() }
|
||||
}
|
||||
|
||||
impl CountPartials for PersistentCompactIntMatrix {
|
||||
fn partial_bray(&self) -> Array2<u64> {
|
||||
self.partial_bray_dist_matrix()
|
||||
}
|
||||
fn partial_euclidean(&self) -> Array2<f64> {
|
||||
self.partial_euclidean_dist_matrix()
|
||||
}
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
self.partial_threshold_jaccard_dist_matrix(threshold)
|
||||
}
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_bray_dist_matrix(global)
|
||||
}
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_euclidean_dist_matrix(global)
|
||||
}
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_hellinger_euclidean_dist_matrix(global)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentCompactIntMatrixBuilder {
|
||||
|
||||
@@ -5,12 +5,14 @@ mod format;
|
||||
mod intmatrix;
|
||||
mod meta;
|
||||
mod reader;
|
||||
pub mod traits;
|
||||
|
||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/mod.rs"]
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
use ndarray::{Array1, Array2};
|
||||
|
||||
/// Column-level weight statistic — total count or presence count per column.
|
||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||
pub trait ColumnWeights: Send + Sync {
|
||||
fn col_weights(&self) -> Array1<u64>;
|
||||
}
|
||||
|
||||
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
|
||||
///
|
||||
/// Every `partial_*` method returns an additive component: element-wise summing the results
|
||||
/// across layers then across partitions yields the global partial, from which the final
|
||||
/// distance is computed via the corresponding provided method.
|
||||
///
|
||||
/// Normalised methods (`partial_relfreq_*`, `partial_hellinger`) require the **global**
|
||||
/// `col_weights` (summed across all layers and partitions) as a parameter. The provided
|
||||
/// finalisation methods compute this in a first pass via `self.col_weights()`.
|
||||
pub trait CountPartials: ColumnWeights {
|
||||
fn partial_bray(&self) -> Array2<u64>;
|
||||
fn partial_euclidean(&self) -> Array2<f64>;
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
|
||||
// ── Provided finalisation methods ─────────────────────────────────────────
|
||||
|
||||
fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let sum_min = self.partial_bray();
|
||||
let w = self.col_weights();
|
||||
let n = w.len();
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let d = w[i] + w[j];
|
||||
m[[i, j]] = if d == 0 { 0.0 }
|
||||
else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / d as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.partial_euclidean().mapv(|v| v.sqrt())
|
||||
}
|
||||
|
||||
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
let (inter, union) = self.partial_threshold_jaccard(threshold);
|
||||
let n = inter.shape()[0];
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let u = union[[i, j]];
|
||||
m[[i, j]] = if u == 0 { 0.0 }
|
||||
else { 1.0 - inter[[i, j]] as f64 / u as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
let mut m = self.partial_relfreq_bray(&global).mapv(|v| 1.0 - v);
|
||||
let n = m.shape()[0];
|
||||
for i in 0..n { m[[i, i]] = 0.0; }
|
||||
m
|
||||
}
|
||||
|
||||
fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
self.partial_relfreq_euclidean(&global).mapv(|v| v.sqrt())
|
||||
}
|
||||
|
||||
fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
let sq2 = std::f64::consts::SQRT_2;
|
||||
self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2)
|
||||
}
|
||||
}
|
||||
|
||||
/// Partial distance matrices for bit-based data (`PersistentBitMatrix`).
|
||||
///
|
||||
/// Both `partial_*` methods are additively decomposable across layers and partitions.
|
||||
pub trait BitPartials: ColumnWeights {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>);
|
||||
fn partial_hamming(&self) -> Array2<u64>;
|
||||
|
||||
// ── Provided finalisation methods ─────────────────────────────────────────
|
||||
|
||||
fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
let (inter, union) = self.partial_jaccard();
|
||||
let n = inter.shape()[0];
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let u = union[[i, j]];
|
||||
m[[i, j]] = if u == 0 { 0.0 }
|
||||
else { 1.0 - inter[[i, j]] as f64 / u as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
self.partial_hamming()
|
||||
}
|
||||
}
|
||||
@@ -11,6 +11,7 @@ ptr_hash = "1.1"
|
||||
cacheline-ef = "1.1"
|
||||
epserde = "0.8"
|
||||
rayon = "1"
|
||||
ndarray = "0.16"
|
||||
memmap2 = "0.9"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -0,0 +1,257 @@
|
||||
use ndarray::{Array1, Array2};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use obicompactvec::traits::{BitPartials, ColumnWeights, CountPartials};
|
||||
|
||||
/// A store that aggregates a `Vec<S>` — one entry per layer (within a partition)
/// or one entry per partition.
///
/// `ColumnWeights`, `CountPartials`, and `BitPartials` are implemented generically
/// for `LayeredStore<S>` whenever `S` implements them, so the impls nest:
/// `LayeredStore<LayeredStore<S>>` implements the same traits as `LayeredStore<S>`,
/// giving the partitioned level for free.
///
/// `Debug` and `Clone` are derived and therefore available whenever `S` provides them.
#[derive(Debug, Clone)]
pub struct LayeredStore<S>(pub Vec<S>);
|
||||
|
||||
impl<S> LayeredStore<S> {
|
||||
pub fn new(layers: Vec<S>) -> Self { Self(layers) }
|
||||
pub fn layers(&self) -> &[S] { &self.0 }
|
||||
pub fn n_layers(&self) -> usize { self.0.len() }
|
||||
pub fn is_empty(&self) -> bool { self.0.is_empty() }
|
||||
}
|
||||
|
||||
// ── ColumnWeights ─────────────────────────────────────────────────────────────
|
||||
|
||||
impl<S: ColumnWeights> ColumnWeights for LayeredStore<S> {
|
||||
fn col_weights(&self) -> Array1<u64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.col_weights())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap_or_else(|| Array1::zeros(0))
|
||||
}
|
||||
}
|
||||
|
||||
// ── CountPartials ─────────────────────────────────────────────────────────────
|
||||
|
||||
impl<S: CountPartials> CountPartials for LayeredStore<S> {
|
||||
fn partial_bray(&self) -> Array2<u64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_bray())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_euclidean(&self) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_euclidean())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_threshold_jaccard(threshold))
|
||||
.reduce_with(|(ai, au), (bi, bu)| (ai + bi, au + bu))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_relfreq_bray(global))
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_relfreq_euclidean(global))
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_hellinger(global))
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
// ── BitPartials ───────────────────────────────────────────────────────────────
|
||||
|
||||
impl<S: BitPartials> BitPartials for LayeredStore<S> {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_jaccard())
|
||||
.reduce_with(|(ai, au), (bi, bu)| (ai + bi, au + bu))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_hamming(&self) -> Array2<u64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_hamming())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use obicompactvec::{
        PersistentBitMatrix, PersistentBitMatrixBuilder,
        PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
    };
    use tempfile::tempdir;

    /// Absolute tolerance used when comparing floating-point distances.
    const EPS: f64 = 1e-12;

    /// Builds an on-disk count matrix from column slices and reopens it.
    ///
    /// The slot count is taken from the first column. The `TempDir` is returned
    /// alongside the matrix so the caller keeps the backing directory alive.
    fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
        let n_slots = cols.first().map(|c| c.len()).unwrap_or(0);
        let dir = tempdir().unwrap();
        let mut builder = PersistentCompactIntMatrixBuilder::new(n_slots, dir.path()).unwrap();
        for col in cols.iter().copied() {
            let mut col_builder = builder.add_col().unwrap();
            for (slot, &value) in col.iter().enumerate() {
                col_builder.set(slot, value);
            }
            col_builder.close().unwrap();
        }
        builder.close().unwrap();
        let matrix = PersistentCompactIntMatrix::open(dir.path()).unwrap();
        (dir, matrix)
    }

    /// Builds an on-disk bit matrix from column slices and reopens it.
    ///
    /// The slot count is taken from the first column. The `TempDir` is returned
    /// alongside the matrix so the caller keeps the backing directory alive.
    fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
        let n_slots = cols.first().map(|c| c.len()).unwrap_or(0);
        let dir = tempdir().unwrap();
        let mut builder = PersistentBitMatrixBuilder::new(n_slots, dir.path()).unwrap();
        for col in cols.iter().copied() {
            let mut col_builder = builder.add_col().unwrap();
            for (slot, &bit) in col.iter().enumerate() {
                col_builder.set(slot, bit);
            }
            col_builder.close().unwrap();
        }
        builder.close().unwrap();
        let matrix = PersistentBitMatrix::open(dir.path()).unwrap();
        (dir, matrix)
    }

    // ── ColumnWeights ─────────────────────────────────────────────────────────

    #[test]
    fn col_weights_sums_across_layers() {
        // layer 0: col0=[1,2],  col1=[3,4]   → weights [3, 7]
        // layer 1: col0=[10,0], col1=[0,10]  → weights [10, 10]
        // combined: [13, 17]
        let (_dir0, layer0) = make_int_matrix(&[&[1, 2], &[3, 4]]);
        let (_dir1, layer1) = make_int_matrix(&[&[10, 0], &[0, 10]]);
        let store = LayeredStore::new(vec![layer0, layer1]);

        let weights = store.col_weights();
        assert_eq!(weights[0], 13);
        assert_eq!(weights[1], 17);
    }

    #[test]
    fn col_weights_bit_sums_across_layers() {
        // layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2]
        // layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2]
        // combined: [3, 4]
        let (_dir0, layer0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]);
        let (_dir1, layer1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]);
        let store = LayeredStore::new(vec![layer0, layer1]);

        let weights = store.col_weights();
        assert_eq!(weights[0], 3);
        assert_eq!(weights[1], 4);
    }

    // ── CountPartials — layered (one partition) ───────────────────────────────

    #[test]
    fn layered_bray_matches_combined() {
        // col0=[1,2,3,4,5] and col1=[5,4,3,2,1] are split across two layers;
        // the layered Bray distance must equal the one computed on the full columns.
        let (_dir0, layer0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1
        let (_dir1, layer1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4
        let store = LayeredStore::new(vec![layer0, layer1]);

        // Reference: the same data in a single matrix.
        let (_dir_full, full) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
        let expected = CountPartials::bray_dist_matrix(&full);
        let got = CountPartials::bray_dist_matrix(&store);

        let upper = (got[[0, 1]] - expected[[0, 1]]).abs();
        let lower = (got[[1, 0]] - expected[[1, 0]]).abs();
        assert!(upper < EPS, "bray [0,1]");
        assert!(lower < EPS, "bray [1,0]");
    }

    #[test]
    fn layered_relfreq_bray_matches_combined() {
        let (_dir0, layer0) = make_int_matrix(&[&[1, 2], &[5, 4]]);
        let (_dir1, layer1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
        let store = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
        let expected = CountPartials::relfreq_bray_dist_matrix(&full);
        let got = CountPartials::relfreq_bray_dist_matrix(&store);

        let delta = (got[[0, 1]] - expected[[0, 1]]).abs();
        assert!(delta < EPS, "relfreq_bray [0,1]");
    }

    #[test]
    fn layered_euclidean_matches_combined() {
        let (_dir0, layer0) = make_int_matrix(&[&[3, 0], &[0, 4]]);
        let (_dir1, layer1) = make_int_matrix(&[&[1, 1], &[2, 2]]);
        let store = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]);
        let expected = CountPartials::euclidean_dist_matrix(&full);
        let got = CountPartials::euclidean_dist_matrix(&store);

        let delta = (got[[0, 1]] - expected[[0, 1]]).abs();
        assert!(delta < EPS, "euclidean [0,1]");
    }

    // ── CountPartials — partitioned (LayeredStore<LayeredStore<_>>) ───────────

    #[test]
    fn partitioned_bray_matches_combined() {
        // partition 0: slots [1,2,3,4,5] col0 vs col1
        // partition 1: slots [10,20]     col0 vs col1
        let (_dir0, part0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
        let (_dir1, part1) = make_int_matrix(&[&[10, 20], &[20, 10]]);

        let partitioned = LayeredStore::new(vec![
            LayeredStore::new(vec![part0]),
            LayeredStore::new(vec![part1]),
        ]);

        let (_dir_full, full) =
            make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]);
        let expected = CountPartials::bray_dist_matrix(&full);
        let got = CountPartials::bray_dist_matrix(&partitioned);

        let delta = (got[[0, 1]] - expected[[0, 1]]).abs();
        assert!(delta < EPS, "partitioned bray [0,1]");
    }

    // ── BitPartials ───────────────────────────────────────────────────────────

    #[test]
    fn layered_jaccard_matches_combined() {
        let (_dir0, layer0) = make_bit_matrix(&[&[true, false], &[false, true]]);
        let (_dir1, layer1) = make_bit_matrix(&[&[true, true], &[true, false]]);
        let store = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_bit_matrix(&[
            &[true, false, true, true],
            &[false, true, true, false],
        ]);
        let expected = BitPartials::jaccard_dist_matrix(&full);
        let got = BitPartials::jaccard_dist_matrix(&store);

        let delta = (got[[0, 1]] - expected[[0, 1]]).abs();
        assert!(delta < EPS, "jaccard [0,1]");
    }

    #[test]
    fn layered_hamming_matches_combined() {
        let (_dir0, layer0) = make_bit_matrix(&[&[true, false], &[false, true]]);
        let (_dir1, layer1) = make_bit_matrix(&[&[true, true], &[false, false]]);
        let store = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_bit_matrix(&[
            &[true, false, true, true],
            &[false, true, false, false],
        ]);
        let expected = BitPartials::hamming_dist_matrix(&full);
        let got = BitPartials::hamming_dist_matrix(&store);

        assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]");
    }
}
|
||||
@@ -1,9 +1,11 @@
|
||||
pub mod error;
|
||||
pub mod evidence;
|
||||
pub mod layer;
|
||||
pub mod layered_store;
|
||||
pub mod map;
|
||||
pub mod meta;
|
||||
|
||||
pub use error::{OLMError, OLMResult};
|
||||
pub use layer::{Hit, Layer, LayerData};
|
||||
pub use layered_store::LayeredStore;
|
||||
pub use map::LayeredMap;
|
||||
|
||||
Reference in New Issue
Block a user