docs: document k-mer index architecture and refactor distance metrics

Add comprehensive documentation for the `obilayeredmap` crate, `PersistentCompactIntVec`, `PersistentBitVec`, and the hierarchical k-mer index architecture, including sidebar navigation updates across all documentation pages. Refactor the Bray-Curtis distance computation in `obicompactvec` to decouple numerator and denominator calculations, replacing direct pairwise calls with explicit loops over precomputed sums. Update tests to verify column sum accuracy and align with the simplified API.
This commit is contained in:
Eric Coissac
2026-05-15 21:07:23 +08:00
parent 8409c852ef
commit 45d49ed501
25 changed files with 8842 additions and 117 deletions
+19 -18
View File
@@ -36,7 +36,20 @@ impl PersistentCompactIntMatrix {
// ── Distance matrices ─────────────────────────────────────────────────────
pub fn bray_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).bray_dist(self.col(j)))
let sum_min = self.partial_bray_dist_matrix();
let col_sums = self.sum();
let n = self.n_cols();
let mut m = Array2::zeros((n, n));
for i in 0..n {
for j in 0..n {
if i != j {
let denom = col_sums[i] + col_sums[j];
m[[i, j]] = if denom == 0 { 0.0 }
else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 };
}
}
}
m
}
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
@@ -74,23 +87,11 @@ impl PersistentCompactIntMatrix {
// ── Partial matrices (additively decomposable across layers) ──────────────
/// Returns `(sum_min[n×n], col_sums[n])`.
/// `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]).
/// `col_sums[k]` = Σ_slot col_k[slot].
/// Reduce across layers by element-wise addition before computing the final distance.
pub fn partial_bray_dist_matrix(&self) -> (Array2<u64>, Array1<u64>) {
let n = self.n_cols();
let col_sums: Vec<u64> = (0..n)
.into_par_iter()
.map(|i| self.col(i).sum())
.collect();
let sum_min = self.pairwise_u64(|i, j| {
self.col(i).partial_bray_dist(self.col(j)).0
});
(sum_min, Array1::from_vec(col_sums))
/// Returns `sum_min[n×n]` where `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]).
/// The denominator `col_sums[i] + col_sums[j]` is obtained from `self.sum()`.
/// Additive across layers by element-wise addition.
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
}
/// Returns sum of squared differences `[n×n]`.
+8 -18
View File
@@ -141,32 +141,22 @@ impl PersistentCompactIntVec {
#[inline]
/// Returns the Bray-Curtis distance between two compact int vectors.
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
let (sum_min, denom) = self.partial_bray_dist(other);
let sum_min = self.partial_bray_dist(other);
let denom = self.sum() + other.sum();
if denom == 0 {
return 0.0;
}
1.0 - 2.0 * sum_min as f64 / denom as f64
}
/// Returns the partial Bray-Curtis distance between two compact int vectors.
///
/// Returns a tuple `(sum_min, denom)` where `sum_min` is the sum of the minimum values
/// at each index, and `denom` is the sum of the values in both vectors.
/// This is used internally by [`bray_dist`] and to easily compute the Bray-Curtis distance
/// over a set of vector pairs.
///
/// Returns the tuple `(sum_min, sum_a + sum_b)` where `sum_min` is the sum of the minimum
/// values at each index, `sum_a` is the sum of the first vector's counts, and `sum_b` is
/// the sum of the second vector's counts.
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> (u64, u64) {
/// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis.
/// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`.
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
assert_eq!(self.n, other.len(), "length mismatch");
let (sum_min, sum_a, sum_b) = self
.iter()
self.iter()
.zip(other.iter())
.fold((0u64, 0u64, 0u64), |(sm, sa, sb), (a, b)| {
(sm + a.min(b) as u64, sa + a as u64, sb + b as u64)
});
(sum_min, sum_a + sum_b)
.map(|(a, b)| a.min(b) as u64)
.sum()
}
/// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
+3 -7
View File
@@ -126,21 +126,17 @@ fn jaccard_dist_matrix_values_match_pairwise() {
#[test]
fn partial_bray_dist_matrix_consistent() {
let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]);
let (sum_min, col_sums) = m.partial_bray_dist_matrix();
let sum_min = m.partial_bray_dist_matrix();
let col_sums = m.sum();
let n = m.n_cols();
// symmetry of sum_min
// symmetry
for i in 0..n {
for j in 0..n {
assert_eq!(sum_min[[i, j]], sum_min[[j, i]]);
}
}
// col_sums correct
for k in 0..n {
assert_eq!(col_sums[k], m.col(k).sum());
}
// reconstruct distance from partials and compare to direct method
for i in 0..n {
for j in i + 1..n {