feat: introduce trait-based distance aggregation and layered store
Introduces ColumnWeights, CountPartials, and BitPartials traits to compute and finalize partial distance matrices. Implements these traits for PersistentBitMatrix, PersistentCompactIntMatrix, and a new LayeredStore<S> wrapper that aggregates metrics across layers via parallel reduction. Adds ndarray for numerical aggregation and updates architecture documentation to reflect the trait-driven design and pending refactoring roadmap.
This commit is contained in:
@@ -117,6 +117,23 @@ where
|
||||
m
|
||||
}
|
||||
|
||||
// ── Trait impls ───────────────────────────────────────────────────────────────
|
||||
|
||||
use crate::traits::{BitPartials, ColumnWeights};
|
||||
|
||||
impl ColumnWeights for PersistentBitMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.count_ones() }
|
||||
}
|
||||
|
||||
impl BitPartials for PersistentBitMatrix {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
self.partial_jaccard_dist_matrix()
|
||||
}
|
||||
fn partial_hamming(&self) -> Array2<u64> {
|
||||
self.partial_hamming_dist_matrix()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentBitMatrixBuilder {
|
||||
|
||||
@@ -203,6 +203,35 @@ where
|
||||
m
|
||||
}
|
||||
|
||||
// ── Trait impls ───────────────────────────────────────────────────────────────
|
||||
|
||||
use crate::traits::{ColumnWeights, CountPartials};
|
||||
|
||||
impl ColumnWeights for PersistentCompactIntMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.sum() }
|
||||
}
|
||||
|
||||
impl CountPartials for PersistentCompactIntMatrix {
|
||||
fn partial_bray(&self) -> Array2<u64> {
|
||||
self.partial_bray_dist_matrix()
|
||||
}
|
||||
fn partial_euclidean(&self) -> Array2<f64> {
|
||||
self.partial_euclidean_dist_matrix()
|
||||
}
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
self.partial_threshold_jaccard_dist_matrix(threshold)
|
||||
}
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_bray_dist_matrix(global)
|
||||
}
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_euclidean_dist_matrix(global)
|
||||
}
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_hellinger_euclidean_dist_matrix(global)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentCompactIntMatrixBuilder {
|
||||
|
||||
@@ -5,12 +5,14 @@ mod format;
|
||||
mod intmatrix;
|
||||
mod meta;
|
||||
mod reader;
|
||||
pub mod traits;
|
||||
|
||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/mod.rs"]
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
use ndarray::{Array1, Array2};
|
||||
|
||||
/// Column-level weight statistic — total count or presence count per column.
|
||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||
pub trait ColumnWeights: Send + Sync {
|
||||
fn col_weights(&self) -> Array1<u64>;
|
||||
}
|
||||
|
||||
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
|
||||
///
|
||||
/// Every `partial_*` method returns an additive component: element-wise summing the results
|
||||
/// across layers then across partitions yields the global partial, from which the final
|
||||
/// distance is computed via the corresponding provided method.
|
||||
///
|
||||
/// Normalised methods (`partial_relfreq_*`, `partial_hellinger`) require the **global**
|
||||
/// `col_weights` (summed across all layers and partitions) as a parameter. The provided
|
||||
/// finalisation methods compute this in a first pass via `self.col_weights()`.
|
||||
pub trait CountPartials: ColumnWeights {
|
||||
fn partial_bray(&self) -> Array2<u64>;
|
||||
fn partial_euclidean(&self) -> Array2<f64>;
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
|
||||
// ── Provided finalisation methods ─────────────────────────────────────────
|
||||
|
||||
fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let sum_min = self.partial_bray();
|
||||
let w = self.col_weights();
|
||||
let n = w.len();
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let d = w[i] + w[j];
|
||||
m[[i, j]] = if d == 0 { 0.0 }
|
||||
else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / d as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.partial_euclidean().mapv(|v| v.sqrt())
|
||||
}
|
||||
|
||||
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
let (inter, union) = self.partial_threshold_jaccard(threshold);
|
||||
let n = inter.shape()[0];
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let u = union[[i, j]];
|
||||
m[[i, j]] = if u == 0 { 0.0 }
|
||||
else { 1.0 - inter[[i, j]] as f64 / u as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
let mut m = self.partial_relfreq_bray(&global).mapv(|v| 1.0 - v);
|
||||
let n = m.shape()[0];
|
||||
for i in 0..n { m[[i, i]] = 0.0; }
|
||||
m
|
||||
}
|
||||
|
||||
fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
self.partial_relfreq_euclidean(&global).mapv(|v| v.sqrt())
|
||||
}
|
||||
|
||||
fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
let sq2 = std::f64::consts::SQRT_2;
|
||||
self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2)
|
||||
}
|
||||
}
|
||||
|
||||
/// Partial distance matrices for bit-based data (`PersistentBitMatrix`).
|
||||
///
|
||||
/// Both `partial_*` methods are additively decomposable across layers and partitions.
|
||||
pub trait BitPartials: ColumnWeights {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>);
|
||||
fn partial_hamming(&self) -> Array2<u64>;
|
||||
|
||||
// ── Provided finalisation methods ─────────────────────────────────────────
|
||||
|
||||
fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
let (inter, union) = self.partial_jaccard();
|
||||
let n = inter.shape()[0];
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let u = union[[i, j]];
|
||||
m[[i, j]] = if u == 0 { 0.0 }
|
||||
else { 1.0 - inter[[i, j]] as f64 / u as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
self.partial_hamming()
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user