diff --git a/doc/architecture/index_architecture/index.html b/doc/architecture/index_architecture/index.html
index 43ccb55..24d05a5 100644
--- a/doc/architecture/index_architecture/index.html
+++ b/doc/architecture/index_architecture/index.html
@@ -968,94 +968,33 @@
Aggregation is hierarchical: each level computes its contribution by aggregating from the level immediately below it. No level skips a level or collects raw data from two levels down.
-PersistentCompactIntMatrix::sum() — column sums for one (partition, layer) matrix
+PersistentCompactIntMatrix::col_weights() — column sums for one (partition, layer) matrix
↓ Σ across layers
-LayeredCompactIntMatrix::sum() — column sums for one partition
+LayeredStore<PersistentCompactIntMatrix>::col_weights() — column sums for one partition
↓ Σ across partitions
-PartitionedCompactIntMatrix::sum() — global column sums
+LayeredStore<LayeredStore<…>>::col_weights() — global column sums
-The same cascade applies to every partial computation:
-PersistentCompactIntMatrix::partial_bray_dist_matrix() — one (partition, layer)
+The same cascade applies to every partial:
+PersistentCompactIntMatrix::partial_bray() — one (partition, layer)
↓ element-wise Σ across layers
-LayeredCompactIntMatrix::partial_bray() — one partition
+LayeredStore<PersistentCompactIntMatrix>::partial_bray() — one partition
↓ element-wise Σ across partitions
-PartitionedCompactIntMatrix::partial_bray() — global partial → final dist
+LayeredStore<LayeredStore<…>>::partial_bray() — global partial → final dist
-This means LayeredCompactIntMatrix never inspects individual PersistentCompactIntVec columns directly, and PartitionedCompactIntMatrix never inspects individual layers. Each level presents a stable API surface to the level above.
+Each level presents a stable trait surface to the level above; no level reaches two levels down.
-LayeredDataStore — aggregation within one partition
-A LayeredDataStore holds one DataStore per layer within a single partition:
-struct LayeredCompactIntMatrix { layers: Vec<PersistentCompactIntMatrix> }
-struct LayeredBitMatrix { layers: Vec<PersistentBitMatrix> }
-
-Column statistics
-// LayeredCompactIntMatrix
-fn sum(&self) -> Array1<u64>
- // = layers.par_iter().map(|m| m.sum()).reduce(element-wise +)
+Traits — obicompactvec::traits
+Three traits unify the aggregation API across all levels of the hierarchy.
+trait ColumnWeights: Send + Sync {
+ fn col_weights(&self) -> Array1<u64>;
+}
-// LayeredBitMatrix
-fn count_ones(&self) -> Array1<u64>
- // = layers.par_iter().map(|m| m.count_ones()).reduce(element-wise +)
-
-Self-contained partials
-Each method reduces across layers by element-wise addition of per-layer matrices:
-fn partial_bray(&self) -> (Array2<u64>, Array1<u64>)
- // Σ_l layer_l.partial_bray_dist_matrix()
+trait CountPartials: ColumnWeights {
+ // self-contained partials (additive, no parameter)
+ fn partial_bray(&self) -> Array2<u64>;
+ fn partial_euclidean(&self) -> Array2<f64>;
+ fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
+ // normalised partials (global col_weights passed in cascade)
+ fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
+ fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
+ fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
+ // provided finalisation methods (default implementations)
+ fn bray_dist_matrix(&self) -> Array2<f64> { … }
+ fn euclidean_dist_matrix(&self) -> Array2<f64> { … }
+ fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> { … }
+ fn relfreq_bray_dist_matrix(&self) -> Array2<f64> { … }
+ fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> { … }
+ fn hellinger_dist_matrix(&self) -> Array2<f64> { … }
+}
-fn partial_euclidean(&self) -> Array2<f64>
- // Σ_l layer_l.partial_euclidean_dist_matrix()
-
-fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>)
- // Σ_l layer_l.partial_jaccard_dist_matrix() [bit matrix]
- // Σ_l layer_l.partial_threshold_jaccard_dist_matrix() [int matrix]
-
-fn partial_hamming(&self) -> Array2<u64>
- // Σ_l layer_l.partial_hamming_dist_matrix() [bit matrix]
-
-Normalised partials (require global sums from above)
-fn partial_relfreq_bray(&self, global_sums: &Array1<u64>) -> Array2<f64>
- // Σ_l layer_l.partial_relfreq_bray_dist_matrix(global_sums)
-
-fn partial_relfreq_euclidean(&self, global_sums: &Array1<u64>) -> Array2<f64>
- // Σ_l layer_l.partial_relfreq_euclidean_dist_matrix(global_sums)
-
-fn partial_hellinger(&self, global_sums: &Array1<u64>) -> Array2<f64>
- // Σ_l layer_l.partial_hellinger_euclidean_dist_matrix(global_sums)
-
-global_sums is provided by the PartitionedDataStore; this level does not compute it.
-
-PartitionedDataStore — aggregation across all partitions
-A PartitionedDataStore holds one LayeredDataStore per partition:
-struct PartitionedCompactIntMatrix { partitions: Vec<LayeredCompactIntMatrix> }
-struct PartitionedBitMatrix { partitions: Vec<LayeredBitMatrix> }
-
-Column statistics
-fn sum(&self) -> Array1<u64>
- // = partitions.par_iter().map(|p| p.sum()).reduce(element-wise +)
-
-p.sum() is itself a reduction across layers (see above) — the cascade is preserved.
-Self-contained metrics — single pass
-fn bray_dist_matrix(&self) -> Array2<f64> {
- let (sum_min, col_sums) = partitions
- .par_iter()
- .map(|p| p.partial_bray())
- .reduce(element-wise +);
- // finalise
- for (i,j): dist[i,j] = 1 - 2·sum_min[i,j] / (col_sums[i] + col_sums[j])
+trait BitPartials: ColumnWeights {
+ fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>);
+ fn partial_hamming(&self) -> Array2<u64>;
+ // provided
+ fn jaccard_dist_matrix(&self) -> Array2<f64> { … }
+ fn hamming_dist_matrix(&self) -> Array2<u64> { … }
}
-Normalised metrics — two passes
-fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
- // pass 1 — progressive: PartitionedDataStore::sum()
- // calls LayeredDataStore::sum() per partition (parallel)
- // calls PersistentCompactIntMatrix::sum() per layer (parallel)
- let global_sums = self.sum();
-
- // pass 2 — per-partition partial using global_sums (parallel)
- let matrix = partitions
- .par_iter()
- .map(|p| p.partial_relfreq_bray(&global_sums))
- .reduce(element-wise +);
- // finalise
- for (i,j): dist[i,j] = 1 - matrix[i,j]
+Leaf implementors (in obicompactvec):
+
+
+
+Type
+Traits
+
+
+
+
+PersistentCompactIntMatrix
+ColumnWeights (via sum()), CountPartials
+
+
+PersistentBitMatrix
+ColumnWeights (via count_ones()), BitPartials
+
+
+
+PersistentCompactIntVec and PersistentBitVec do not implement these traits — they are single-column primitives, not matrix-level aggregators.
+
+LayeredStore<S> — obilayeredmap
+A single generic wrapper replaces the need for named LayeredDataStore and PartitionedDataStore types:
+pub struct LayeredStore<S>(Vec<S>);
+
+Three blanket impls propagate the traits up the hierarchy:
+impl<S: ColumnWeights> ColumnWeights for LayeredStore<S> { … } // Σ across inner stores
+impl<S: CountPartials> CountPartials for LayeredStore<S> { … } // same pattern
+impl<S: BitPartials> BitPartials for LayeredStore<S> { … } // same pattern
+
+Because the blanket impl is recursive, LayeredStore<LayeredStore<S>> automatically inherits all three traits when S does — no separate PartitionedStore type is needed:
+PersistentCompactIntMatrix implements CountPartials
+LayeredStore<PersistentCompactIntMatrix> via blanket impl (= one partition)
+LayeredStore<LayeredStore<…>> via blanket impl (= partitioned index)
+
+Normalised metrics — two-pass cascade
+The normalised finalisation methods call col_weights() first (pass 1), then the normalised partial (pass 2). Both calls go through the same blanket impl, so the cascade is automatic:
+// called on LayeredStore<LayeredStore<PersistentCompactIntMatrix>>
+fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
+ let global = self.col_weights(); // pass 1 — progressive sum at every level
+ let p = self.partial_relfreq_bray(&global); // pass 2 — global passed in cascade
+ p.mapv(|v| 1.0 - v) // finalise (diagonal zeroed separately)
}
-global_sums is exact because each kmer belongs to exactly one (partition, layer) pair — no double-counting. Pass 1 is itself fully parallel at every level of the hierarchy.
+global is exact: each kmer belongs to exactly one (partition, layer) pair, so there is no double-counting across the hierarchy.
Parallelism model
@@ -1687,31 +1642,32 @@ PartitionedCompactIntMatrix::partial_bray() — global partial →
Across partitions
-LayeredDataStore
+LayeredStore<LayeredStore<S>> inner stores
none — fully independent
-Across layers (self-contained)
-(partition, layer) pair
+Across layers within a partition
+LayeredStore<S> inner stores
none — disjoint kmer sets
-Across layers (normalised, pass 1)
-(partition, layer) pair
-none — sums are additive
+Normalised pass 1 (col_weights)
+per inner store
+none — additive
-Across layers (normalised, pass 2)
-(partition, layer) pair
-global_sums broadcast read-only
+Normalised pass 2 (partial)
+per inner store
+global broadcast read-only
-Within a DataStore (distance matrix)
+Within a matrix (distance)
upper-triangle pair (i,j)
-none — rayon par_iter
+none — rayon par_iter
+All levels use rayon par_iter internally; reduce_with performs a parallel tree reduction.
Query model
Point query — kmer → Option<Item>
@@ -1742,19 +1698,24 @@ for (p, l) in all_partition_layer_pairs().par_iter():
Other derivations: threshold a count matrix → binary presence matrix; union two presence matrices; merge two count matrices (saturating add, column-wise). All are local to one (partition, layer) pair.
Relationship to current implementation
-The current obilayeredmap crate implements a subset of this architecture. Key divergences:
+What is implemented
-Layer<D: LayerData> fuses MphfLayer and one DataStore into a single generic type. Multiple data stores on the same MPHF are not supported.
-LayerData::open(dir) embeds the path convention (counts/, presence/) inside the store type, preventing the PartitionedIndex from managing paths externally.
-LayeredDataStore and PartitionedDataStore do not yet exist; LayeredMap is a single-partition structure without a distance matrix API.
-- The partial distance methods exist on PersistentCompactIntMatrix and PersistentBitMatrix and are tested; they are not yet composed across layers and partitions.
+obicompactvec::traits: ColumnWeights, CountPartials, BitPartials are defined and implemented on PersistentCompactIntMatrix and PersistentBitMatrix.
+obilayeredmap::LayeredStore<S>: generic wrapper with blanket impls for all three traits. LayeredStore<LayeredStore<S>> is the partitioned level — no separate type needed. Tests confirm that splitting data across layers and across partitions gives the same distance matrices as computing on flat combined data.
-Planned refactoring:
-1. Extract MphfLayer from Layer<D> as an autonomous type.
-2. Replace LayerData trait with DataStore trait (no path knowledge).
-3. Implement LayeredCompactIntMatrix / LayeredBitMatrix with the partial + full distance APIs described above.
-4. Implement PartitionedCompactIntMatrix / PartitionedBitMatrix with two-pass support for normalised metrics.
-5. Implement PartitionedIndex for point queries with parallel dispatch.
+What is not yet implemented
+
+Layer<D: LayerData> still fuses MphfLayer and one DataStore. Multiple data stores on the same MPHF are not supported.
+LayeredMap is a single-partition structure without distance matrix API; it does not yet use LayeredStore.
+- No PartitionedIndex type for point queries with parallel partition dispatch.
+
+Planned refactoring
+
+- Extract MphfLayer from Layer<D> as an autonomous type.
+- Replace LayerData trait with the DataStore / ColumnWeights / CountPartials / BitPartials system.
+- Rewire LayeredMap to hold LayeredStore<PersistentCompactIntMatrix> (or bit variant) alongside the MPHF layers.
+- Implement PartitionedIndex using LayeredStore<LayeredStore<S>> for data and parallel dispatch for queries.
+
diff --git a/doc/sitemap.xml.gz b/doc/sitemap.xml.gz
index e43d52e..3b8a973 100644
Binary files a/doc/sitemap.xml.gz and b/doc/sitemap.xml.gz differ
diff --git a/docmd/architecture/index_architecture.md b/docmd/architecture/index_architecture.md
index 9db00a2..cbc1083 100644
--- a/docmd/architecture/index_architecture.md
+++ b/docmd/architecture/index_architecture.md
@@ -141,135 +141,112 @@ The `col_sums` parameter must reflect the GLOBAL count across all layers and all
Aggregation is **hierarchical**: each level computes its contribution by aggregating from the level immediately below it. No level skips a level or collects raw data from two levels down.
```
-PersistentCompactIntMatrix::sum() — column sums for one (partition, layer) matrix
+PersistentCompactIntMatrix::col_weights() — column sums for one (partition, layer) matrix
↓ Σ across layers
-LayeredCompactIntMatrix::sum() — column sums for one partition
+LayeredStore<PersistentCompactIntMatrix>::col_weights() — column sums for one partition
↓ Σ across partitions
-PartitionedCompactIntMatrix::sum() — global column sums
+LayeredStore<LayeredStore<…>>::col_weights() — global column sums
```
-The same cascade applies to every partial computation:
+The same cascade applies to every partial:
```
-PersistentCompactIntMatrix::partial_bray_dist_matrix() — one (partition, layer)
+PersistentCompactIntMatrix::partial_bray() — one (partition, layer)
↓ element-wise Σ across layers
-LayeredCompactIntMatrix::partial_bray() — one partition
+LayeredStore<PersistentCompactIntMatrix>::partial_bray() — one partition
↓ element-wise Σ across partitions
-PartitionedCompactIntMatrix::partial_bray() — global partial → final dist
+LayeredStore<LayeredStore<…>>::partial_bray() — global partial → final dist
```
-This means `LayeredCompactIntMatrix` never inspects individual `PersistentCompactIntVec` columns directly, and `PartitionedCompactIntMatrix` never inspects individual layers. Each level presents a stable API surface to the level above.
+Each level presents a stable trait surface to the level above; no level reaches two levels down.
---
-## LayeredDataStore — aggregation within one partition
+## Traits — `obicompactvec::traits`
-A `LayeredDataStore` holds one `DataStore` per layer within a single partition:
+Three traits unify the aggregation API across all levels of the hierarchy.
```rust
-struct LayeredCompactIntMatrix { layers: Vec<PersistentCompactIntMatrix> }
-struct LayeredBitMatrix { layers: Vec<PersistentBitMatrix> }
-```
+trait ColumnWeights: Send + Sync {
+    fn col_weights(&self) -> Array1<u64>;
+}
-### Column statistics
+trait CountPartials: ColumnWeights {
+    // self-contained partials (additive, no parameter)
+    fn partial_bray(&self) -> Array2<u64>;
+    fn partial_euclidean(&self) -> Array2<f64>;
+    fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
+    // normalised partials (global col_weights passed in cascade)
+    fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
+    fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
+    fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
+    // provided finalisation methods (default implementations)
+    fn bray_dist_matrix(&self) -> Array2<f64> { … }
+    fn euclidean_dist_matrix(&self) -> Array2<f64> { … }
+    fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> { … }
+    fn relfreq_bray_dist_matrix(&self) -> Array2<f64> { … }
+    fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> { … }
+    fn hellinger_dist_matrix(&self) -> Array2<f64> { … }
+}
-```rust
-// LayeredCompactIntMatrix
-fn sum(&self) -> Array1<u64>
-    // = layers.par_iter().map(|m| m.sum()).reduce(element-wise +)
-
-// LayeredBitMatrix
-fn count_ones(&self) -> Array1<u64>
-    // = layers.par_iter().map(|m| m.count_ones()).reduce(element-wise +)
-```
-
-### Self-contained partials
-
-Each method reduces across layers by element-wise addition of per-layer matrices:
-
-```rust
-fn partial_bray(&self) -> (Array2<u64>, Array1<u64>)
-    // Σ_l layer_l.partial_bray_dist_matrix()
-
-fn partial_euclidean(&self) -> Array2<f64>
-    // Σ_l layer_l.partial_euclidean_dist_matrix()
-
-fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>)
-    // Σ_l layer_l.partial_jaccard_dist_matrix() [bit matrix]
-    // Σ_l layer_l.partial_threshold_jaccard_dist_matrix() [int matrix]
-
-fn partial_hamming(&self) -> Array2<u64>
-    // Σ_l layer_l.partial_hamming_dist_matrix() [bit matrix]
-```
-
-### Normalised partials (require global sums from above)
-
-```rust
-fn partial_relfreq_bray(&self, global_sums: &Array1<u64>) -> Array2<f64>
-    // Σ_l layer_l.partial_relfreq_bray_dist_matrix(global_sums)
-
-fn partial_relfreq_euclidean(&self, global_sums: &Array1<u64>) -> Array2<f64>
-    // Σ_l layer_l.partial_relfreq_euclidean_dist_matrix(global_sums)
-
-fn partial_hellinger(&self, global_sums: &Array1<u64>) -> Array2<f64>
-    // Σ_l layer_l.partial_hellinger_euclidean_dist_matrix(global_sums)
-```
-
-`global_sums` is provided by the `PartitionedDataStore`; this level does not compute it.
-
----
-
-## PartitionedDataStore — aggregation across all partitions
-
-A `PartitionedDataStore` holds one `LayeredDataStore` per partition:
-
-```rust
-struct PartitionedCompactIntMatrix { partitions: Vec<LayeredCompactIntMatrix> }
-struct PartitionedBitMatrix { partitions: Vec<LayeredBitMatrix> }
-```
-
-### Column statistics
-
-```rust
-fn sum(&self) -> Array1<u64>
- // = partitions.par_iter().map(|p| p.sum()).reduce(element-wise +)
-```
-
-`p.sum()` is itself a reduction across layers (see above) — the cascade is preserved.
-
-### Self-contained metrics — single pass
-
-```rust
-fn bray_dist_matrix(&self) -> Array2<f64> {
- let (sum_min, col_sums) = partitions
- .par_iter()
- .map(|p| p.partial_bray())
- .reduce(element-wise +);
- // finalise
- for (i,j): dist[i,j] = 1 - 2·sum_min[i,j] / (col_sums[i] + col_sums[j])
+trait BitPartials: ColumnWeights {
+    fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>);
+    fn partial_hamming(&self) -> Array2<u64>;
+    // provided
+    fn jaccard_dist_matrix(&self) -> Array2<f64> { … }
+    fn hamming_dist_matrix(&self) -> Array2<u64> { … }
}
```
-### Normalised metrics — two passes
+**Leaf implementors** (in `obicompactvec`):
+
+| Type | Traits |
+|---|---|
+| `PersistentCompactIntMatrix` | `ColumnWeights` (via `sum()`), `CountPartials` |
+| `PersistentBitMatrix` | `ColumnWeights` (via `count_ones()`), `BitPartials` |
+
+`PersistentCompactIntVec` and `PersistentBitVec` do **not** implement these traits — they are single-column primitives, not matrix-level aggregators.
+
+---
+
+## `LayeredStore<S>` — `obilayeredmap`
+
+A single generic wrapper replaces the need for named `LayeredDataStore` and `PartitionedDataStore` types:
```rust
+pub struct LayeredStore<S>(Vec<S>);
+```
+
+Three blanket impls propagate the traits up the hierarchy:
+
+```rust
+impl<S: ColumnWeights> ColumnWeights for LayeredStore<S> { … } // Σ across inner stores
+impl<S: CountPartials> CountPartials for LayeredStore<S> { … } // same pattern
+impl<S: BitPartials> BitPartials for LayeredStore<S> { … } // same pattern
+```
+
+Because the blanket impl is recursive, **`LayeredStore<LayeredStore<S>>`** automatically inherits all three traits when `S` does — no separate `PartitionedStore` type is needed:
+
+```
+PersistentCompactIntMatrix implements CountPartials
+LayeredStore<PersistentCompactIntMatrix> via blanket impl (= one partition)
+LayeredStore<LayeredStore<…>> via blanket impl (= partitioned index)
+```
+
+### Normalised metrics — two-pass cascade
+
+The normalised finalisation methods call `col_weights()` first (pass 1), then the normalised partial (pass 2). Both calls go through the same blanket impl, so the cascade is automatic:
+
+```rust
+// called on LayeredStore<LayeredStore<PersistentCompactIntMatrix>>
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
- // pass 1 — progressive: PartitionedDataStore::sum()
- // calls LayeredDataStore::sum() per partition (parallel)
- // calls PersistentCompactIntMatrix::sum() per layer (parallel)
- let global_sums = self.sum();
-
- // pass 2 — per-partition partial using global_sums (parallel)
- let matrix = partitions
- .par_iter()
- .map(|p| p.partial_relfreq_bray(&global_sums))
- .reduce(element-wise +);
- // finalise
- for (i,j): dist[i,j] = 1 - matrix[i,j]
+ let global = self.col_weights(); // pass 1 — progressive sum at every level
+ let p = self.partial_relfreq_bray(&global); // pass 2 — global passed in cascade
+ p.mapv(|v| 1.0 - v) // finalise (diagonal zeroed separately)
}
```
-`global_sums` is exact because each kmer belongs to exactly one (partition, layer) pair — no double-counting. Pass 1 is itself fully parallel at every level of the hierarchy.
+`global` is exact: each kmer belongs to exactly one `(partition, layer)` pair, so there is no double-counting across the hierarchy.
---
@@ -277,11 +254,13 @@ fn relfreq_bray_dist_matrix(&self) -> Array2 {
| Level | Unit | Coordination |
|---|---|---|
-| Across partitions | `LayeredDataStore` | none — fully independent |
-| Across layers (self-contained) | `(partition, layer)` pair | none — disjoint kmer sets |
-| Across layers (normalised, pass 1) | `(partition, layer)` pair | none — sums are additive |
-| Across layers (normalised, pass 2) | `(partition, layer)` pair | global_sums broadcast read-only |
-| Within a DataStore (distance matrix) | upper-triangle pair `(i,j)` | none — rayon par_iter |
+| Across partitions | `LayeredStore<LayeredStore<S>>` inner stores | none — fully independent |
+| Across layers within a partition | `LayeredStore<S>` inner stores | none — disjoint kmer sets |
+| Normalised pass 1 (`col_weights`) | per inner store | none — additive |
+| Normalised pass 2 (partial) | per inner store | `global` broadcast read-only |
+| Within a matrix (distance) | upper-triangle pair `(i,j)` | none — rayon `par_iter` |
+
+All levels use rayon `par_iter` internally; `reduce_with` performs a parallel tree reduction.
---
@@ -331,16 +310,20 @@ Other derivations: threshold a count matrix → binary presence matrix; union tw
## Relationship to current implementation
-The current `obilayeredmap` crate implements a subset of this architecture. Key divergences:
+### What is implemented
-- `Layer<D: LayerData>` fuses `MphfLayer` and one `DataStore` into a single generic type. Multiple data stores on the same MPHF are not supported.
-- `LayerData::open(dir)` embeds the path convention (`counts/`, `presence/`) inside the store type, preventing the `PartitionedIndex` from managing paths externally.
-- `LayeredDataStore` and `PartitionedDataStore` do not yet exist; `LayeredMap` is a single-partition structure without a distance matrix API.
-- The partial distance methods exist on `PersistentCompactIntMatrix` and `PersistentBitMatrix` and are tested; they are not yet composed across layers and partitions.
+- **`obicompactvec::traits`**: `ColumnWeights`, `CountPartials`, `BitPartials` are defined and implemented on `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
+- **`obilayeredmap::LayeredStore<S>`**: generic wrapper with blanket impls for all three traits. `LayeredStore<LayeredStore<S>>` is the partitioned level — no separate type needed. Tests confirm that splitting data across layers and across partitions gives the same distance matrices as computing on flat combined data.
+
+### What is not yet implemented
+
+- `Layer<D: LayerData>` still fuses `MphfLayer` and one `DataStore`. Multiple data stores on the same MPHF are not supported.
+- `LayeredMap` is a single-partition structure without distance matrix API; it does not yet use `LayeredStore`.
+- No `PartitionedIndex` type for point queries with parallel partition dispatch.
+
+### Planned refactoring
-Planned refactoring:
1. Extract `MphfLayer` from `Layer<D>` as an autonomous type.
-2. Replace `LayerData` trait with `DataStore` trait (no path knowledge).
-3. Implement `LayeredCompactIntMatrix` / `LayeredBitMatrix` with the partial + full distance APIs described above.
-4. Implement `PartitionedCompactIntMatrix` / `PartitionedBitMatrix` with two-pass support for normalised metrics.
-5. Implement `PartitionedIndex` for point queries with parallel dispatch.
+2. Replace `LayerData` trait with the `DataStore` / `ColumnWeights` / `CountPartials` / `BitPartials` system.
+3. Rewire `LayeredMap` to hold `LayeredStore<PersistentCompactIntMatrix>` (or bit variant) alongside the MPHF layers.
+4. Implement `PartitionedIndex` using `LayeredStore<LayeredStore<S>>` for data and parallel dispatch for queries.
diff --git a/src/Cargo.lock b/src/Cargo.lock
index 68a2b0d..b1de6b4 100644
--- a/src/Cargo.lock
+++ b/src/Cargo.lock
@@ -1788,6 +1788,7 @@ dependencies = [
"cacheline-ef",
"epserde 0.8.0",
"memmap2",
+ "ndarray",
"obicompactvec",
"obikseq",
"obiskio",
diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs
index f6e76b2..df823d1 100644
--- a/src/obicompactvec/src/bitmatrix.rs
+++ b/src/obicompactvec/src/bitmatrix.rs
@@ -117,6 +117,23 @@ where
m
}
+// ── Trait impls ───────────────────────────────────────────────────────────────
+
+use crate::traits::{BitPartials, ColumnWeights};
+
+impl ColumnWeights for PersistentBitMatrix {
+    fn col_weights(&self) -> Array1<u64> { self.count_ones() } // column weight = count_ones()
+}
+
+impl BitPartials for PersistentBitMatrix {
+    fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>) {
+        self.partial_jaccard_dist_matrix() // delegates to the inherent partial
+    }
+    fn partial_hamming(&self) -> Array2<u64> {
+        self.partial_hamming_dist_matrix() // delegates to the inherent partial
+    }
+}
+
// ── Builder ───────────────────────────────────────────────────────────────────
pub struct PersistentBitMatrixBuilder {
diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs
index 9ea70a8..c77a135 100644
--- a/src/obicompactvec/src/intmatrix.rs
+++ b/src/obicompactvec/src/intmatrix.rs
@@ -203,6 +203,35 @@ where
m
}
+// ── Trait impls ───────────────────────────────────────────────────────────────
+
+use crate::traits::{ColumnWeights, CountPartials};
+
+impl ColumnWeights for PersistentCompactIntMatrix {
+    fn col_weights(&self) -> Array1<u64> { self.sum() } // column weight = sum()
+}
+
+impl CountPartials for PersistentCompactIntMatrix {
+    fn partial_bray(&self) -> Array2<u64> {
+        self.partial_bray_dist_matrix()
+    }
+    fn partial_euclidean(&self) -> Array2<f64> {
+        self.partial_euclidean_dist_matrix()
+    }
+    fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
+        self.partial_threshold_jaccard_dist_matrix(threshold)
+    }
+    fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64> {
+        self.partial_relfreq_bray_dist_matrix(global)
+    }
+    fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64> {
+        self.partial_relfreq_euclidean_dist_matrix(global)
+    }
+    fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64> {
+        self.partial_hellinger_euclidean_dist_matrix(global)
+    }
+}
+
// ── Builder ───────────────────────────────────────────────────────────────────
pub struct PersistentCompactIntMatrixBuilder {
diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs
index df02fb8..270956f 100644
--- a/src/obicompactvec/src/lib.rs
+++ b/src/obicompactvec/src/lib.rs
@@ -5,12 +5,14 @@ mod format;
mod intmatrix;
mod meta;
mod reader;
+pub mod traits;
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder};
pub use builder::PersistentCompactIntVecBuilder;
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
pub use reader::PersistentCompactIntVec;
+pub use traits::{BitPartials, ColumnWeights, CountPartials};
#[cfg(test)]
#[path = "tests/mod.rs"]
diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs
new file mode 100644
index 0000000..bc10097
--- /dev/null
+++ b/src/obicompactvec/src/traits.rs
@@ -0,0 +1,113 @@
+use ndarray::{Array1, Array2};
+
+/// Column-level weight statistic — total count or presence count per column.
+/// Additive across layers and partitions; used as denominator in normalised distances.
+pub trait ColumnWeights: Send + Sync {
+    fn col_weights(&self) -> Array1<u64>;
+}
+
+/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
+///
+/// Every `partial_*` method returns an additive component: element-wise summing the results
+/// across layers then across partitions yields the global partial, from which the final
+/// distance is computed via the corresponding provided method.
+///
+/// Normalised methods (`partial_relfreq_*`, `partial_hellinger`) require the **global**
+/// `col_weights` (summed across all layers and partitions) as a parameter. The provided
+/// finalisation methods compute this in a first pass via `self.col_weights()`.
+pub trait CountPartials: ColumnWeights {
+    fn partial_bray(&self) -> Array2<u64>;
+    fn partial_euclidean(&self) -> Array2<f64>;
+    fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
+    fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
+    fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
+    fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
+
+    // ── Provided finalisation methods ─────────────────────────────────────────
+
+    fn bray_dist_matrix(&self) -> Array2<f64> {
+        let sum_min = self.partial_bray();
+        let w = self.col_weights();
+        let n = w.len();
+        let mut m = Array2::<f64>::zeros((n, n));
+        for i in 0..n {
+            for j in 0..n {
+                if i != j {
+                    let d = w[i] + w[j];
+                    m[[i, j]] = if d == 0 { 0.0 }
+                                else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / d as f64 };
+                }
+            }
+        }
+        m
+    }
+
+    fn euclidean_dist_matrix(&self) -> Array2<f64> {
+        self.partial_euclidean().mapv(|v| v.sqrt())
+    }
+
+    fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
+        let (inter, union) = self.partial_threshold_jaccard(threshold);
+        let n = inter.shape()[0];
+        let mut m = Array2::<f64>::zeros((n, n));
+        for i in 0..n {
+            for j in 0..n {
+                if i != j {
+                    let u = union[[i, j]];
+                    m[[i, j]] = if u == 0 { 0.0 }
+                                else { 1.0 - inter[[i, j]] as f64 / u as f64 };
+                }
+            }
+        }
+        m
+    }
+
+    fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
+        let global = self.col_weights();
+        let mut m = self.partial_relfreq_bray(&global).mapv(|v| 1.0 - v);
+        let n = m.shape()[0];
+        for i in 0..n { m[[i, i]] = 0.0; }
+        m
+    }
+
+    fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
+        let global = self.col_weights();
+        self.partial_relfreq_euclidean(&global).mapv(|v| v.sqrt())
+    }
+
+    fn hellinger_dist_matrix(&self) -> Array2<f64> {
+        let global = self.col_weights();
+        let sq2 = std::f64::consts::SQRT_2;
+        self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2)
+    }
+}
+
+/// Partial distance matrices for bit-based data (`PersistentBitMatrix`).
+///
+/// Both `partial_*` methods are additively decomposable across layers and partitions.
+pub trait BitPartials: ColumnWeights {
+    fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>);
+    fn partial_hamming(&self) -> Array2<u64>;
+
+    // ── Provided finalisation methods ─────────────────────────────────────────
+
+    fn jaccard_dist_matrix(&self) -> Array2<f64> {
+        let (inter, union) = self.partial_jaccard();
+        let n = inter.shape()[0];
+        let mut m = Array2::<f64>::zeros((n, n));
+        for i in 0..n {
+            for j in 0..n {
+                if i != j {
+                    let u = union[[i, j]];
+                    m[[i, j]] = if u == 0 { 0.0 }
+                                else { 1.0 - inter[[i, j]] as f64 / u as f64 };
+                }
+            }
+        }
+        m
+    }
+
+    fn hamming_dist_matrix(&self) -> Array2<u64> {
+        self.partial_hamming()
+    }
+}
diff --git a/src/obilayeredmap/Cargo.toml b/src/obilayeredmap/Cargo.toml
index 2964d07..1bb3d94 100644
--- a/src/obilayeredmap/Cargo.toml
+++ b/src/obilayeredmap/Cargo.toml
@@ -11,6 +11,7 @@ ptr_hash = "1.1"
cacheline-ef = "1.1"
epserde = "0.8"
rayon = "1"
+ndarray = "0.16"
memmap2 = "0.9"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
diff --git a/src/obilayeredmap/src/layered_store.rs b/src/obilayeredmap/src/layered_store.rs
new file mode 100644
index 0000000..f25855d
--- /dev/null
+++ b/src/obilayeredmap/src/layered_store.rs
@@ -0,0 +1,257 @@
+use ndarray::{Array1, Array2};
+use rayon::prelude::*;
+
+use obicompactvec::traits::{BitPartials, ColumnWeights, CountPartials};
+
+/// A store that aggregates a `Vec<S>` — one entry per layer (within a partition)
+/// or one entry per partition.
+///
+/// Blanket impls of `ColumnWeights`, `CountPartials`, and `BitPartials` propagate
+/// automatically: `LayeredStore<LayeredStore<S>>` implements the same traits as
+/// `LayeredStore<S>`, giving the partitioned level for free.
+pub struct LayeredStore<S>(pub Vec<S>);
+
+impl<S> LayeredStore<S> {
+    /// Wraps `layers` as given, without copying or reordering the entries.
+    pub fn new(layers: Vec<S>) -> Self { Self(layers) }
+    /// Borrows the wrapped entries as a slice.
+    pub fn layers(&self) -> &[S] { &self.0 }
+    /// Number of wrapped entries.
+    pub fn n_layers(&self) -> usize { self.0.len() }
+    /// `true` when the store wraps no entries at all.
+    pub fn is_empty(&self) -> bool { self.0.is_empty() }
+}
+
+// ── ColumnWeights ─────────────────────────────────────────────────────────────
+
+impl ColumnWeights for LayeredStore {
+    /// Adds up `col_weights()` of every wrapped store, visiting them in parallel.
+    ///
+    /// A store with no entries yields a zero-length array.
+    fn col_weights(&self) -> Array1 {
+        let summed = self
+            .0
+            .par_iter()
+            .map(|entry| entry.col_weights())
+            .reduce_with(|acc, next| acc + next);
+        match summed {
+            Some(total) => total,
+            // Nothing to sum: there is no column count to size the result with.
+            None => Array1::zeros(0),
+        }
+    }
+}
+
+// ── CountPartials ─────────────────────────────────────────────────────────────
+
+/// Every method calls the same-named method on each wrapped store in parallel and
+/// adds the results together with `+`.
+///
+/// A store with no entries yields empty (0×0) matrices instead of panicking, in line
+/// with `col_weights`, which returns a zero-length array in that case.
+///
+/// NOTE(review): an empty inner store mixed with non-empty siblings would feed
+/// differently shaped arrays to `+` — confirm whether callers can build that.
+impl CountPartials for LayeredStore {
+    /// Sum of `partial_bray()` over all wrapped stores.
+    fn partial_bray(&self) -> Array2 {
+        self.0.par_iter()
+            .map(|s| s.partial_bray())
+            .reduce_with(|a, b| a + b)
+            .unwrap_or_else(|| Array2::zeros((0, 0)))
+    }
+
+    /// Sum of `partial_euclidean()` over all wrapped stores.
+    fn partial_euclidean(&self) -> Array2 {
+        self.0.par_iter()
+            .map(|s| s.partial_euclidean())
+            .reduce_with(|a, b| a + b)
+            .unwrap_or_else(|| Array2::zeros((0, 0)))
+    }
+
+    /// Component-wise sum of the two matrices returned by
+    /// `partial_threshold_jaccard(threshold)` over all wrapped stores.
+    fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2, Array2) {
+        self.0.par_iter()
+            .map(|s| s.partial_threshold_jaccard(threshold))
+            .reduce_with(|(ai, au), (bi, bu)| (ai + bi, au + bu))
+            .unwrap_or_else(|| (Array2::zeros((0, 0)), Array2::zeros((0, 0))))
+    }
+
+    /// Sum of `partial_relfreq_bray(global)` over all wrapped stores; `global` is
+    /// forwarded unchanged to each of them.
+    fn partial_relfreq_bray(&self, global: &Array1) -> Array2 {
+        self.0.par_iter()
+            .map(|s| s.partial_relfreq_bray(global))
+            .reduce_with(|a, b| a + b)
+            .unwrap_or_else(|| Array2::zeros((0, 0)))
+    }
+
+    /// Sum of `partial_relfreq_euclidean(global)` over all wrapped stores; `global`
+    /// is forwarded unchanged to each of them.
+    fn partial_relfreq_euclidean(&self, global: &Array1) -> Array2 {
+        self.0.par_iter()
+            .map(|s| s.partial_relfreq_euclidean(global))
+            .reduce_with(|a, b| a + b)
+            .unwrap_or_else(|| Array2::zeros((0, 0)))
+    }
+
+    /// Sum of `partial_hellinger(global)` over all wrapped stores; `global` is
+    /// forwarded unchanged to each of them.
+    fn partial_hellinger(&self, global: &Array1) -> Array2 {
+        self.0.par_iter()
+            .map(|s| s.partial_hellinger(global))
+            .reduce_with(|a, b| a + b)
+            .unwrap_or_else(|| Array2::zeros((0, 0)))
+    }
+}
+
+// ── BitPartials ───────────────────────────────────────────────────────────────
+
+/// Both methods call the same-named method on each wrapped store in parallel and add
+/// the results together with `+`.
+///
+/// A store with no entries yields empty (0×0) matrices instead of panicking, in line
+/// with `col_weights`, which returns a zero-length array in that case.
+///
+/// NOTE(review): an empty inner store mixed with non-empty siblings would feed
+/// differently shaped arrays to `+` — confirm whether callers can build that.
+impl BitPartials for LayeredStore {
+    /// Component-wise sum of the two matrices returned by `partial_jaccard()` over
+    /// all wrapped stores.
+    fn partial_jaccard(&self) -> (Array2, Array2) {
+        self.0.par_iter()
+            .map(|s| s.partial_jaccard())
+            .reduce_with(|(ai, au), (bi, bu)| (ai + bi, au + bu))
+            .unwrap_or_else(|| (Array2::zeros((0, 0)), Array2::zeros((0, 0))))
+    }
+
+    /// Sum of `partial_hamming()` over all wrapped stores.
+    fn partial_hamming(&self) -> Array2 {
+        self.0.par_iter()
+            .map(|s| s.partial_hamming())
+            .reduce_with(|a, b| a + b)
+            .unwrap_or_else(|| Array2::zeros((0, 0)))
+    }
+}
+
+// ── Tests ─────────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use obicompactvec::{
+        PersistentBitMatrix, PersistentBitMatrixBuilder,
+        PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
+    };
+    use tempfile::tempdir;
+
+    /// Writes one count column per slice of `cols` into a fresh temp directory and
+    /// reopens the result. The slot count is the length of the first column (0 when
+    /// `cols` is empty). The `TempDir` is handed back together with the matrix.
+    fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
+        let n_slots = cols.first().map(|c| c.len()).unwrap_or(0);
+        let tmp = tempdir().unwrap();
+        let mut builder = PersistentCompactIntMatrixBuilder::new(n_slots, tmp.path()).unwrap();
+        for values in cols.iter().copied() {
+            let mut column = builder.add_col().unwrap();
+            values.iter().enumerate().for_each(|(slot, &v)| { column.set(slot, v); });
+            column.close().unwrap();
+        }
+        builder.close().unwrap();
+        let matrix = PersistentCompactIntMatrix::open(tmp.path()).unwrap();
+        (tmp, matrix)
+    }
+
+    /// Bit-matrix counterpart of `make_int_matrix`: one bit column per slice of `cols`.
+    fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
+        let n_slots = cols.first().map(|c| c.len()).unwrap_or(0);
+        let tmp = tempdir().unwrap();
+        let mut builder = PersistentBitMatrixBuilder::new(n_slots, tmp.path()).unwrap();
+        for bits in cols.iter().copied() {
+            let mut column = builder.add_col().unwrap();
+            bits.iter().enumerate().for_each(|(slot, &v)| { column.set(slot, v); });
+            column.close().unwrap();
+        }
+        builder.close().unwrap();
+        let matrix = PersistentBitMatrix::open(tmp.path()).unwrap();
+        (tmp, matrix)
+    }
+
+    // ── ColumnWeights ─────────────────────────────────────────────────────────
+
+    #[test]
+    fn col_weights_sums_across_layers() {
+        // first layer : col0=[1,2],  col1=[3,4]  → column sums [3, 7]
+        // second layer: col0=[10,0], col1=[0,10] → column sums [10, 10]
+        // total over both layers: [13, 17]
+        let (_keep_a, layer_a) = make_int_matrix(&[&[1, 2], &[3, 4]]);
+        let (_keep_b, layer_b) = make_int_matrix(&[&[10, 0], &[0, 10]]);
+        let layered = LayeredStore::new(vec![layer_a, layer_b]);
+        let weights = layered.col_weights();
+        assert_eq!(weights[0], 13);
+        assert_eq!(weights[1], 17);
+    }
+
+    #[test]
+    fn col_weights_bit_sums_across_layers() {
+        // first layer : col0=[T,F,T], col1=[F,T,T] → set-bit counts [2, 2]
+        // second layer: col0=[F,F,T], col1=[T,T,F] → set-bit counts [1, 2]
+        // total over both layers: [3, 4]
+        let (_keep_a, layer_a) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]);
+        let (_keep_b, layer_b) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]);
+        let layered = LayeredStore::new(vec![layer_a, layer_b]);
+        let weights = layered.col_weights();
+        assert_eq!(weights[0], 3);
+        assert_eq!(weights[1], 4);
+    }
+
+    // ── CountPartials — layered (one partition) ───────────────────────────────
+
+    #[test]
+    fn layered_bray_matches_combined() {
+        // The five slots of col0=[1,2,3,4,5] and col1=[5,4,3,2,1] are split over two
+        // layers (slots 0-1, then slots 2-4). The layered distance must agree with
+        // the distance computed on a single matrix holding all five slots.
+        let (_keep_a, head) = make_int_matrix(&[&[1, 2], &[5, 4]]);
+        let (_keep_b, tail) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
+        let layered = LayeredStore::new(vec![head, tail]);
+
+        // Reference: the unsplit data.
+        let (_keep_full, full) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
+        let reference = CountPartials::bray_dist_matrix(&full);
+        let actual = CountPartials::bray_dist_matrix(&layered);
+        let upper_gap = (actual[[0, 1]] - reference[[0, 1]]).abs();
+        assert!(upper_gap < 1e-12, "bray [0,1]");
+        let lower_gap = (actual[[1, 0]] - reference[[1, 0]]).abs();
+        assert!(lower_gap < 1e-12, "bray [1,0]");
+    }
+
+    #[test]
+    fn layered_relfreq_bray_matches_combined() {
+        let (_keep_a, head) = make_int_matrix(&[&[1, 2], &[5, 4]]);
+        let (_keep_b, tail) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
+        let layered = LayeredStore::new(vec![head, tail]);
+
+        let (_keep_full, full) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
+        let reference = CountPartials::relfreq_bray_dist_matrix(&full);
+        let actual = CountPartials::relfreq_bray_dist_matrix(&layered);
+        let gap = (actual[[0, 1]] - reference[[0, 1]]).abs();
+        assert!(gap < 1e-12, "relfreq_bray [0,1]");
+    }
+
+    #[test]
+    fn layered_euclidean_matches_combined() {
+        let (_keep_a, head) = make_int_matrix(&[&[3, 0], &[0, 4]]);
+        let (_keep_b, tail) = make_int_matrix(&[&[1, 1], &[2, 2]]);
+        let layered = LayeredStore::new(vec![head, tail]);
+
+        let (_keep_full, full) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]);
+        let reference = CountPartials::euclidean_dist_matrix(&full);
+        let actual = CountPartials::euclidean_dist_matrix(&layered);
+        let gap = (actual[[0, 1]] - reference[[0, 1]]).abs();
+        assert!(gap < 1e-12, "euclidean [0,1]");
+    }
+
+    // ── CountPartials — partitioned (LayeredStore<LayeredStore<…>>) ───────────
+
+    #[test]
+    fn partitioned_bray_matches_combined() {
+        // Two single-layer partitions: the first holds five slots, the second two.
+        // The reference matrix holds all seven slots in one place.
+        let (_keep_a, first) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
+        let (_keep_b, second) = make_int_matrix(&[&[10, 20], &[20, 10]]);
+
+        let first_partition = LayeredStore::new(vec![first]);
+        let second_partition = LayeredStore::new(vec![second]);
+        let partitioned = LayeredStore::new(vec![first_partition, second_partition]);
+
+        let (_keep_full, full) =
+            make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]);
+        let reference = CountPartials::bray_dist_matrix(&full);
+        let actual = CountPartials::bray_dist_matrix(&partitioned);
+        let gap = (actual[[0, 1]] - reference[[0, 1]]).abs();
+        assert!(gap < 1e-12, "partitioned bray [0,1]");
+    }
+
+    // ── BitPartials ───────────────────────────────────────────────────────────
+
+    #[test]
+    fn layered_jaccard_matches_combined() {
+        let (_keep_a, head) = make_bit_matrix(&[&[true, false], &[false, true]]);
+        let (_keep_b, tail) = make_bit_matrix(&[&[true, true], &[true, false]]);
+        let layered = LayeredStore::new(vec![head, tail]);
+
+        let (_keep_full, full) = make_bit_matrix(&[
+            &[true, false, true, true],
+            &[false, true, true, false],
+        ]);
+        let reference = BitPartials::jaccard_dist_matrix(&full);
+        let actual = BitPartials::jaccard_dist_matrix(&layered);
+        let gap = (actual[[0, 1]] - reference[[0, 1]]).abs();
+        assert!(gap < 1e-12, "jaccard [0,1]");
+    }
+
+    #[test]
+    fn layered_hamming_matches_combined() {
+        let (_keep_a, head) = make_bit_matrix(&[&[true, false], &[false, true]]);
+        let (_keep_b, tail) = make_bit_matrix(&[&[true, true], &[false, false]]);
+        let layered = LayeredStore::new(vec![head, tail]);
+
+        let (_keep_full, full) = make_bit_matrix(&[
+            &[true, false, true, true],
+            &[false, true, false, false],
+        ]);
+        let reference = BitPartials::hamming_dist_matrix(&full);
+        let actual = BitPartials::hamming_dist_matrix(&layered);
+        assert_eq!(actual[[0, 1]], reference[[0, 1]], "hamming [0,1]");
+    }
+}
diff --git a/src/obilayeredmap/src/lib.rs b/src/obilayeredmap/src/lib.rs
index 0872bdc..32b08d1 100644
--- a/src/obilayeredmap/src/lib.rs
+++ b/src/obilayeredmap/src/lib.rs
@@ -1,9 +1,11 @@
pub mod error;
pub mod evidence;
pub mod layer;
+pub mod layered_store;
pub mod map;
pub mod meta;
pub use error::{OLMError, OLMResult};
pub use layer::{Hit, Layer, LayerData};
+pub use layered_store::LayeredStore;
pub use map::LayeredMap;