feat: introduce trait-based distance aggregation and layered store
Introduces ColumnWeights, CountPartials, and BitPartials traits to compute and finalize partial distance matrices. Implements these traits for PersistentBitMatrix, PersistentCompactIntMatrix, and a new LayeredStore<S> wrapper that aggregates metrics across layers via parallel reduction. Adds ndarray for numerical aggregation and updates architecture documentation to reflect the trait-driven design and pending refactoring roadmap.
This commit is contained in:
@@ -968,94 +968,33 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#layereddatastore-aggregation-within-one-partition" class="md-nav__link">
|
||||
<a href="#traits-obicompactvectraits" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
LayeredDataStore — aggregation within one partition
|
||||
Traits — obicompactvec::traits
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="LayeredDataStore — aggregation within one partition">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#column-statistics" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Column statistics
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#self-contained-partials" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Self-contained partials
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#normalised-partials-require-global-sums-from-above" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Normalised partials (require global sums from above)
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#partitioneddatastore-aggregation-across-all-partitions" class="md-nav__link">
|
||||
<a href="#layeredstores-obilayeredmap" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
PartitionedDataStore — aggregation across all partitions
|
||||
LayeredStore&lt;S&gt; — obilayeredmap
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="PartitionedDataStore — aggregation across all partitions">
|
||||
<nav class="md-nav" aria-label="LayeredStore<S> — obilayeredmap">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#column-statistics_1" class="md-nav__link">
|
||||
<a href="#normalised-metrics-two-pass-cascade" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Column statistics
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#self-contained-metrics-single-pass" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Self-contained metrics — single pass
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#normalised-metrics-two-passes" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Normalised metrics — two passes
|
||||
Normalised metrics — two-pass cascade
|
||||
|
||||
</span>
|
||||
</a>
|
||||
@@ -1137,6 +1076,45 @@
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Relationship to current implementation">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#what-is-implemented" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
What is implemented
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#what-is-not-yet-implemented" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
What is not yet implemented
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#planned-refactoring" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Planned refactoring
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
@@ -1276,94 +1254,33 @@
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#layereddatastore-aggregation-within-one-partition" class="md-nav__link">
|
||||
<a href="#traits-obicompactvectraits" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
LayeredDataStore — aggregation within one partition
|
||||
Traits — obicompactvec::traits
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="LayeredDataStore — aggregation within one partition">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#column-statistics" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Column statistics
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#self-contained-partials" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Self-contained partials
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#normalised-partials-require-global-sums-from-above" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Normalised partials (require global sums from above)
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#partitioneddatastore-aggregation-across-all-partitions" class="md-nav__link">
|
||||
<a href="#layeredstores-obilayeredmap" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
PartitionedDataStore — aggregation across all partitions
|
||||
LayeredStore&lt;S&gt; — obilayeredmap
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="PartitionedDataStore — aggregation across all partitions">
|
||||
<nav class="md-nav" aria-label="LayeredStore<S> — obilayeredmap">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#column-statistics_1" class="md-nav__link">
|
||||
<a href="#normalised-metrics-two-pass-cascade" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Column statistics
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#self-contained-metrics-single-pass" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Self-contained metrics — single pass
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#normalised-metrics-two-passes" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Normalised metrics — two passes
|
||||
Normalised metrics — two-pass cascade
|
||||
|
||||
</span>
|
||||
</a>
|
||||
@@ -1445,6 +1362,45 @@
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Relationship to current implementation">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#what-is-implemented" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
What is implemented
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#what-is-not-yet-implemented" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
What is not yet implemented
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#planned-refactoring" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Planned refactoring
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
@@ -1581,99 +1537,98 @@
|
||||
<hr />
|
||||
<h2 id="progressive-aggregation-principle">Progressive aggregation principle</h2>
|
||||
<p>Aggregation is <strong>hierarchical</strong>: each level computes its contribution by aggregating from the level immediately below it. No level skips a level or collects raw data from two levels down.</p>
|
||||
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix::sum() — column sums for one (partition, layer) matrix
|
||||
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix::col_weights() — column sums for one (partition, layer) matrix
|
||||
↓ Σ across layers
|
||||
LayeredCompactIntMatrix::sum() — column sums for one partition
|
||||
LayeredStore&lt;PersistentCompactIntMatrix&gt;::col_weights() — column sums for one partition
|
||||
↓ Σ across partitions
|
||||
PartitionedCompactIntMatrix::sum() — global column sums
|
||||
LayeredStore&lt;LayeredStore&lt;…&gt;&gt;::col_weights() — global column sums
|
||||
</code></pre></div>
|
||||
<p>The same cascade applies to every partial computation:</p>
|
||||
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix::partial_bray_dist_matrix() — one (partition, layer)
|
||||
<p>The same cascade applies to every partial:</p>
|
||||
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix::partial_bray() — one (partition, layer)
|
||||
↓ element-wise Σ across layers
|
||||
LayeredCompactIntMatrix::partial_bray() — one partition
|
||||
LayeredStore&lt;PersistentCompactIntMatrix&gt;::partial_bray() — one partition
|
||||
↓ element-wise Σ across partitions
|
||||
PartitionedCompactIntMatrix::partial_bray() — global partial → final dist
|
||||
LayeredStore&lt;LayeredStore&lt;…&gt;&gt;::partial_bray() — global partial → final dist
|
||||
</code></pre></div>
|
||||
<p>This means <code>LayeredCompactIntMatrix</code> never inspects individual <code>PersistentCompactIntVec</code> columns directly, and <code>PartitionedCompactIntMatrix</code> never inspects individual layers. Each level presents a stable API surface to the level above.</p>
|
||||
<p>Each level presents a stable trait surface to the level above; no level reaches two levels down.</p>
|
||||
<hr />
|
||||
<h2 id="layereddatastore-aggregation-within-one-partition">LayeredDataStore — aggregation within one partition</h2>
|
||||
<p>A <code>LayeredDataStore</code> holds one <code>DataStore</code> per layer within a single partition:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">LayeredCompactIntMatrix</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">layers</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="n">PersistentCompactIntMatrix</span><span class="o">></span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">struct</span><span class="w"> </span><span class="nc">LayeredBitMatrix</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">layers</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="n">PersistentBitMatrix</span><span class="o">></span><span class="w"> </span><span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<h3 id="column-statistics">Column statistics</h3>
|
||||
<div class="highlight"><pre><span></span><code><span class="c1">// LayeredCompactIntMatrix</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">sum</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// = layers.par_iter().map(|m| m.sum()).reduce(element-wise +)</span>
|
||||
<h2 id="traits-obicompactvectraits">Traits — <code>obicompactvec::traits</code></h2>
|
||||
<p>Three traits unify the aggregation API across all levels of the hierarchy.</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">trait</span><span class="w"> </span><span class="n">ColumnWeights</span><span class="p">:</span><span class="w"> </span><span class="nb">Send</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="nb">Sync</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">col_weights</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">;</span>
|
||||
<span class="p">}</span>
|
||||
|
||||
<span class="c1">// LayeredBitMatrix</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">count_ones</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// = layers.par_iter().map(|m| m.count_ones()).reduce(element-wise +)</span>
|
||||
</code></pre></div>
|
||||
<h3 id="self-contained-partials">Self-contained partials</h3>
|
||||
<p>Each method reduces across layers by element-wise addition of per-layer matrices:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">partial_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_bray_dist_matrix()</span>
|
||||
<span class="k">trait</span><span class="w"> </span><span class="n">CountPartials</span><span class="p">:</span><span class="w"> </span><span class="nc">ColumnWeights</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="c1">// self-contained partials (additive, no parameter)</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">;</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_threshold_jaccard</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">threshold</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="c1">// normalised partials (global col_weights passed in cascade)</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_hellinger</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
||||
<span class="w"> </span><span class="c1">// provided finalisation methods (default implementations)</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">euclidean_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">threshold_jaccard_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">threshold</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_euclidean_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">hellinger_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="p">}</span>
|
||||
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">partial_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_euclidean_dist_matrix()</span>
|
||||
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">partial_jaccard</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_jaccard_dist_matrix() [bit matrix]</span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_threshold_jaccard_dist_matrix() [int matrix]</span>
|
||||
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">partial_hamming</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_hamming_dist_matrix() [bit matrix]</span>
|
||||
</code></pre></div>
|
||||
<h3 id="normalised-partials-require-global-sums-from-above">Normalised partials (require global sums from above)</h3>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global_sums</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_relfreq_bray_dist_matrix(global_sums)</span>
|
||||
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global_sums</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_relfreq_euclidean_dist_matrix(global_sums)</span>
|
||||
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">partial_hellinger</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global_sums</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// Σ_l layer_l.partial_hellinger_euclidean_dist_matrix(global_sums)</span>
|
||||
</code></pre></div>
|
||||
<p><code>global_sums</code> is provided by the <code>PartitionedDataStore</code>; this level does not compute it.</p>
|
||||
<hr />
|
||||
<h2 id="partitioneddatastore-aggregation-across-all-partitions">PartitionedDataStore — aggregation across all partitions</h2>
|
||||
<p>A <code>PartitionedDataStore</code> holds one <code>LayeredDataStore</code> per partition:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PartitionedCompactIntMatrix</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">partitions</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="n">LayeredCompactIntMatrix</span><span class="o">></span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">struct</span><span class="w"> </span><span class="nc">PartitionedBitMatrix</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">partitions</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="n">LayeredBitMatrix</span><span class="o">></span><span class="w"> </span><span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<h3 id="column-statistics_1">Column statistics</h3>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">sum</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span>
|
||||
<span class="w"> </span><span class="c1">// = partitions.par_iter().map(|p| p.sum()).reduce(element-wise +)</span>
|
||||
</code></pre></div>
|
||||
<p><code>p.sum()</code> is itself a reduction across layers (see above) — the cascade is preserved.</p>
|
||||
<h3 id="self-contained-metrics-single-pass">Self-contained metrics — single pass</h3>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="p">(</span><span class="n">sum_min</span><span class="p">,</span><span class="w"> </span><span class="n">col_sums</span><span class="p">)</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">partitions</span>
|
||||
<span class="w"> </span><span class="p">.</span><span class="n">par_iter</span><span class="p">()</span>
|
||||
<span class="w"> </span><span class="p">.</span><span class="n">map</span><span class="p">(</span><span class="o">|</span><span class="n">p</span><span class="o">|</span><span class="w"> </span><span class="n">p</span><span class="p">.</span><span class="n">partial_bray</span><span class="p">())</span>
|
||||
<span class="w"> </span><span class="p">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">element</span><span class="o">-</span><span class="n">wise</span><span class="w"> </span><span class="o">+</span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="c1">// finalise</span>
|
||||
<span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">i</span><span class="p">,</span><span class="n">j</span><span class="p">):</span><span class="w"> </span><span class="nc">dist</span><span class="p">[</span><span class="n">i</span><span class="p">,</span><span class="n">j</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="err">·</span><span class="n">sum_min</span><span class="p">[</span><span class="n">i</span><span class="p">,</span><span class="n">j</span><span class="p">]</span><span class="w"> </span><span class="o">/</span><span class="w"> </span><span class="p">(</span><span class="n">col_sums</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">col_sums</span><span class="p">[</span><span class="n">j</span><span class="p">])</span>
|
||||
<span class="k">trait</span><span class="w"> </span><span class="n">BitPartials</span><span class="p">:</span><span class="w"> </span><span class="nc">ColumnWeights</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_jaccard</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_hamming</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">;</span>
|
||||
<span class="w"> </span><span class="c1">// provided</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">jaccard_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">hamming_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<h3 id="normalised-metrics-two-passes">Normalised metrics — two passes</h3>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="c1">// pass 1 — progressive: PartitionedDataStore::sum()</span>
|
||||
<span class="w"> </span><span class="c1">// calls LayeredDataStore::sum() per partition (parallel)</span>
|
||||
<span class="w"> </span><span class="c1">// calls PersistentCompactIntMatrix::sum() per layer (parallel)</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">global_sums</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">sum</span><span class="p">();</span>
|
||||
|
||||
<span class="w"> </span><span class="c1">// pass 2 — per-partition partial using global_sums (parallel)</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">matrix</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">partitions</span>
|
||||
<span class="w"> </span><span class="p">.</span><span class="n">par_iter</span><span class="p">()</span>
|
||||
<span class="w"> </span><span class="p">.</span><span class="n">map</span><span class="p">(</span><span class="o">|</span><span class="n">p</span><span class="o">|</span><span class="w"> </span><span class="n">p</span><span class="p">.</span><span class="n">partial_relfreq_bray</span><span class="p">(</span><span class="o">&</span><span class="n">global_sums</span><span class="p">))</span>
|
||||
<span class="w"> </span><span class="p">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">element</span><span class="o">-</span><span class="n">wise</span><span class="w"> </span><span class="o">+</span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="c1">// finalise</span>
|
||||
<span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="p">(</span><span class="n">i</span><span class="p">,</span><span class="n">j</span><span class="p">):</span><span class="w"> </span><span class="nc">dist</span><span class="p">[</span><span class="n">i</span><span class="p">,</span><span class="n">j</span><span class="p">]</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">matrix</span><span class="p">[</span><span class="n">i</span><span class="p">,</span><span class="n">j</span><span class="p">]</span>
|
||||
<p><strong>Leaf implementors</strong> (in <code>obicompactvec</code>):</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Type</th>
|
||||
<th>Traits</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><code>PersistentCompactIntMatrix</code></td>
|
||||
<td><code>ColumnWeights</code> (via <code>sum()</code>), <code>CountPartials</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>PersistentBitMatrix</code></td>
|
||||
<td><code>ColumnWeights</code> (via <code>count_ones()</code>), <code>BitPartials</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p><code>PersistentCompactIntVec</code> and <code>PersistentBitVec</code> do <strong>not</strong> implement these traits — they are single-column primitives, not matrix-level aggregators.</p>
|
||||
<hr />
|
||||
<h2 id="layeredstores-obilayeredmap"><code>LayeredStore<S></code> — <code>obilayeredmap</code></h2>
|
||||
<p>A single generic wrapper replaces the need for named <code>LayeredDataStore</code> and <code>PartitionedDataStore</code> types:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="p">(</span><span class="nb">Vec</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="p">);</span>
|
||||
</code></pre></div>
|
||||
<p>Three blanket impls propagate the traits up the hierarchy:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">ColumnWeights</span><span class="o">></span><span class="w"> </span><span class="n">ColumnWeights</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="c1">// Σ across inner stores</span>
|
||||
<span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">CountPartials</span><span class="o">></span><span class="w"> </span><span class="n">CountPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="c1">// same pattern</span>
|
||||
<span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">BitPartials</span><span class="o">></span><span class="w"> </span><span class="n">BitPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="c1">// same pattern</span>
|
||||
</code></pre></div>
|
||||
<p>Because the blanket impl is recursive, <strong><code>LayeredStore<LayeredStore<S>></code></strong> automatically inherits all three traits when <code>S</code> does — no separate <code>PartitionedStore</code> type is needed:</p>
|
||||
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix implements CountPartials
|
||||
LayeredStore<PersistentCompactIntMatrix> via blanket impl (= one partition)
|
||||
LayeredStore<LayeredStore<…>> via blanket impl (= partitioned index)
|
||||
</code></pre></div>
|
||||
<h3 id="normalised-metrics-two-pass-cascade">Normalised metrics — two-pass cascade</h3>
|
||||
<p>The normalised finalisation methods call <code>col_weights()</code> first (pass 1), then the normalised partial (pass 2). Both calls go through the same blanket impl, so the cascade is automatic:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="c1">// called on LayeredStore<LayeredStore<PersistentCompactIntMatrix>></span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">global</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">col_weights</span><span class="p">();</span><span class="w"> </span><span class="c1">// pass 1 — progressive sum at every level</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">p</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">partial_relfreq_bray</span><span class="p">(</span><span class="o">&</span><span class="n">global</span><span class="p">);</span><span class="w"> </span><span class="c1">// pass 2 — global passed in cascade</span>
|
||||
<span class="w"> </span><span class="n">p</span><span class="p">.</span><span class="n">mapv</span><span class="p">(</span><span class="o">|</span><span class="n">v</span><span class="o">|</span><span class="w"> </span><span class="mf">1.0</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">v</span><span class="p">)</span><span class="w"> </span><span class="c1">// finalise (diagonal zeroed separately)</span>
|
||||
<span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<p><code>global_sums</code> is exact because each kmer belongs to exactly one (partition, layer) pair — no double-counting. Pass 1 is itself fully parallel at every level of the hierarchy.</p>
|
||||
<p><code>global</code> is exact: each kmer belongs to exactly one <code>(partition, layer)</code> pair, so there is no double-counting across the hierarchy.</p>
|
||||
<hr />
|
||||
<h2 id="parallelism-model">Parallelism model</h2>
|
||||
<table>
|
||||
@@ -1687,31 +1642,32 @@ PartitionedCompactIntMatrix::partial_bray() — global partial →
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>Across partitions</td>
|
||||
<td><code>LayeredDataStore</code></td>
|
||||
<td><code>LayeredStore<LayeredStore<S>></code> inner stores</td>
|
||||
<td>none — fully independent</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Across layers (self-contained)</td>
|
||||
<td><code>(partition, layer)</code> pair</td>
|
||||
<td>Across layers within a partition</td>
|
||||
<td><code>LayeredStore<S></code> inner stores</td>
|
||||
<td>none — disjoint kmer sets</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Across layers (normalised, pass 1)</td>
|
||||
<td><code>(partition, layer)</code> pair</td>
|
||||
<td>none — sums are additive</td>
|
||||
<td>Normalised pass 1 (<code>col_weights</code>)</td>
|
||||
<td>per inner store</td>
|
||||
<td>none — additive</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Across layers (normalised, pass 2)</td>
|
||||
<td><code>(partition, layer)</code> pair</td>
|
||||
<td>global_sums broadcast read-only</td>
|
||||
<td>Normalised pass 2 (partial)</td>
|
||||
<td>per inner store</td>
|
||||
<td><code>global</code> broadcast read-only</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Within a DataStore (distance matrix)</td>
|
||||
<td>Within a matrix (distance)</td>
|
||||
<td>upper-triangle pair <code>(i,j)</code></td>
|
||||
<td>none — rayon par_iter</td>
|
||||
<td>none — rayon <code>par_iter</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>All levels use rayon <code>par_iter</code> internally; <code>reduce_with</code> performs a parallel tree reduction.</p>
|
||||
<hr />
|
||||
<h2 id="query-model">Query model</h2>
|
||||
<h3 id="point-query-kmer-optionitem">Point query — <code>kmer → Option<Item></code></h3>
|
||||
@@ -1742,19 +1698,24 @@ for (p, l) in all_partition_layer_pairs().par_iter():
|
||||
<p>Other derivations: threshold a count matrix → binary presence matrix; union two presence matrices; merge two count matrices (saturating add, column-wise). All are local to one <code>(partition, layer)</code> pair.</p>
|
||||
<hr />
|
||||
<h2 id="relationship-to-current-implementation">Relationship to current implementation</h2>
|
||||
<p>The current <code>obilayeredmap</code> crate implements a subset of this architecture. Key divergences:</p>
|
||||
<h3 id="what-is-implemented">What is implemented</h3>
|
||||
<ul>
|
||||
<li><code>Layer<D: LayerData></code> fuses <code>MphfLayer</code> and one <code>DataStore</code> into a single generic type. Multiple data stores on the same MPHF are not supported.</li>
|
||||
<li><code>LayerData::open(dir)</code> embeds the path convention (<code>counts/</code>, <code>presence/</code>) inside the store type, preventing the <code>PartitionedIndex</code> from managing paths externally.</li>
|
||||
<li><code>LayeredDataStore</code> and <code>PartitionedDataStore</code> do not yet exist; <code>LayeredMap</code> is a single-partition structure without a distance matrix API.</li>
|
||||
<li>The partial distance methods exist on <code>PersistentCompactIntMatrix</code> and <code>PersistentBitMatrix</code> and are tested; they are not yet composed across layers and partitions.</li>
|
||||
<li><strong><code>obicompactvec::traits</code></strong>: <code>ColumnWeights</code>, <code>CountPartials</code>, <code>BitPartials</code> are defined and implemented on <code>PersistentCompactIntMatrix</code> and <code>PersistentBitMatrix</code>.</li>
|
||||
<li><strong><code>obilayeredmap::LayeredStore<S></code></strong>: generic wrapper with blanket impls for all three traits. <code>LayeredStore<LayeredStore<S>></code> is the partitioned level — no separate type needed. Tests confirm that splitting data across layers and across partitions gives the same distance matrices as computing on flat combined data.</li>
|
||||
</ul>
|
||||
<p>Planned refactoring:
|
||||
1. Extract <code>MphfLayer</code> from <code>Layer<D></code> as an autonomous type.
|
||||
2. Replace <code>LayerData</code> trait with <code>DataStore</code> trait (no path knowledge).
|
||||
3. Implement <code>LayeredCompactIntMatrix</code> / <code>LayeredBitMatrix</code> with the partial + full distance APIs described above.
|
||||
4. Implement <code>PartitionedCompactIntMatrix</code> / <code>PartitionedBitMatrix</code> with two-pass support for normalised metrics.
|
||||
5. Implement <code>PartitionedIndex</code> for point queries with parallel dispatch.</p>
|
||||
<h3 id="what-is-not-yet-implemented">What is not yet implemented</h3>
|
||||
<ul>
|
||||
<li><code>Layer<D: LayerData></code> still fuses <code>MphfLayer</code> and one <code>DataStore</code>. Multiple data stores on the same MPHF are not supported.</li>
|
||||
<li><code>LayeredMap</code> is a single-partition structure without distance matrix API; it does not yet use <code>LayeredStore</code>.</li>
|
||||
<li>No <code>PartitionedIndex</code> type for point queries with parallel partition dispatch.</li>
|
||||
</ul>
|
||||
<h3 id="planned-refactoring">Planned refactoring</h3>
|
||||
<ol>
|
||||
<li>Extract <code>MphfLayer</code> from <code>Layer<D></code> as an autonomous type.</li>
|
||||
<li>Replace <code>LayerData</code> trait with the <code>DataStore</code> / <code>ColumnWeights</code> / <code>CountPartials</code> / <code>BitPartials</code> system.</li>
|
||||
<li>Rewire <code>LayeredMap</code> to hold <code>LayeredStore<PersistentCompactIntMatrix></code> (or bit variant) alongside the MPHF layers.</li>
|
||||
<li>Implement <code>PartitionedIndex</code> using <code>LayeredStore<LayeredStore<S>></code> for data and parallel dispatch for queries.</li>
|
||||
</ol>
|
||||
|
||||
|
||||
|
||||
|
||||
Binary file not shown.
@@ -141,135 +141,112 @@ The `col_sums` parameter must reflect the GLOBAL count across all layers and all
|
||||
Aggregation is **hierarchical**: each level computes its contribution by aggregating from the level immediately below it. No level skips a level or collects raw data from two levels down.
|
||||
|
||||
```
|
||||
PersistentCompactIntMatrix::sum() — column sums for one (partition, layer) matrix
|
||||
PersistentCompactIntMatrix::col_weights() — column sums for one (partition, layer) matrix
|
||||
↓ Σ across layers
|
||||
LayeredCompactIntMatrix::sum() — column sums for one partition
|
||||
LayeredStore<PersistentCompactIntMatrix>::col_weights() — column sums for one partition
|
||||
↓ Σ across partitions
|
||||
PartitionedCompactIntMatrix::sum() — global column sums
|
||||
LayeredStore<LayeredStore<…>>::col_weights() — global column sums
|
||||
```
|
||||
|
||||
The same cascade applies to every partial computation:
|
||||
The same cascade applies to every partial:
|
||||
|
||||
```
|
||||
PersistentCompactIntMatrix::partial_bray_dist_matrix() — one (partition, layer)
|
||||
PersistentCompactIntMatrix::partial_bray() — one (partition, layer)
|
||||
↓ element-wise Σ across layers
|
||||
LayeredCompactIntMatrix::partial_bray() — one partition
|
||||
LayeredStore<PersistentCompactIntMatrix>::partial_bray() — one partition
|
||||
↓ element-wise Σ across partitions
|
||||
PartitionedCompactIntMatrix::partial_bray() — global partial → final dist
|
||||
LayeredStore<LayeredStore<…>>::partial_bray() — global partial → final dist
|
||||
```
|
||||
|
||||
This means `LayeredCompactIntMatrix` never inspects individual `PersistentCompactIntVec` columns directly, and `PartitionedCompactIntMatrix` never inspects individual layers. Each level presents a stable API surface to the level above.
|
||||
Each level presents a stable trait surface to the level above; no level reaches two levels down.
|
||||
|
||||
---
|
||||
|
||||
## LayeredDataStore — aggregation within one partition
|
||||
## Traits — `obicompactvec::traits`
|
||||
|
||||
A `LayeredDataStore` holds one `DataStore` per layer within a single partition:
|
||||
Three traits unify the aggregation API across all levels of the hierarchy.
|
||||
|
||||
```rust
|
||||
struct LayeredCompactIntMatrix { layers: Vec<PersistentCompactIntMatrix> }
|
||||
struct LayeredBitMatrix { layers: Vec<PersistentBitMatrix> }
|
||||
```
|
||||
trait ColumnWeights: Send + Sync {
|
||||
fn col_weights(&self) -> Array1<u64>;
|
||||
}
|
||||
|
||||
### Column statistics
|
||||
trait CountPartials: ColumnWeights {
|
||||
// self-contained partials (additive, no parameter)
|
||||
fn partial_bray(&self) -> Array2<u64>;
|
||||
fn partial_euclidean(&self) -> Array2<f64>;
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
|
||||
// normalised partials (global col_weights passed in cascade)
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
// provided finalisation methods (default implementations)
|
||||
fn bray_dist_matrix(&self) -> Array2<f64> { … }
|
||||
fn euclidean_dist_matrix(&self) -> Array2<f64> { … }
|
||||
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> { … }
|
||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> { … }
|
||||
fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> { … }
|
||||
fn hellinger_dist_matrix(&self) -> Array2<f64> { … }
|
||||
}
|
||||
|
||||
```rust
|
||||
// LayeredCompactIntMatrix
|
||||
fn sum(&self) -> Array1<u64>
|
||||
// = layers.par_iter().map(|m| m.sum()).reduce(element-wise +)
|
||||
|
||||
// LayeredBitMatrix
|
||||
fn count_ones(&self) -> Array1<u64>
|
||||
// = layers.par_iter().map(|m| m.count_ones()).reduce(element-wise +)
|
||||
```
|
||||
|
||||
### Self-contained partials
|
||||
|
||||
Each method reduces across layers by element-wise addition of per-layer matrices:
|
||||
|
||||
```rust
|
||||
fn partial_bray(&self) -> (Array2<u64>, Array1<u64>)
|
||||
// Σ_l layer_l.partial_bray_dist_matrix()
|
||||
|
||||
fn partial_euclidean(&self) -> Array2<f64>
|
||||
// Σ_l layer_l.partial_euclidean_dist_matrix()
|
||||
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>)
|
||||
// Σ_l layer_l.partial_jaccard_dist_matrix() [bit matrix]
|
||||
// Σ_l layer_l.partial_threshold_jaccard_dist_matrix() [int matrix]
|
||||
|
||||
fn partial_hamming(&self) -> Array2<u64>
|
||||
// Σ_l layer_l.partial_hamming_dist_matrix() [bit matrix]
|
||||
```
|
||||
|
||||
### Normalised partials (require global sums from above)
|
||||
|
||||
```rust
|
||||
fn partial_relfreq_bray(&self, global_sums: &Array1<u64>) -> Array2<f64>
|
||||
// Σ_l layer_l.partial_relfreq_bray_dist_matrix(global_sums)
|
||||
|
||||
fn partial_relfreq_euclidean(&self, global_sums: &Array1<u64>) -> Array2<f64>
|
||||
// Σ_l layer_l.partial_relfreq_euclidean_dist_matrix(global_sums)
|
||||
|
||||
fn partial_hellinger(&self, global_sums: &Array1<u64>) -> Array2<f64>
|
||||
// Σ_l layer_l.partial_hellinger_euclidean_dist_matrix(global_sums)
|
||||
```
|
||||
|
||||
`global_sums` is provided by the `PartitionedDataStore`; this level does not compute it.
|
||||
|
||||
---
|
||||
|
||||
## PartitionedDataStore — aggregation across all partitions
|
||||
|
||||
A `PartitionedDataStore` holds one `LayeredDataStore` per partition:
|
||||
|
||||
```rust
|
||||
struct PartitionedCompactIntMatrix { partitions: Vec<LayeredCompactIntMatrix> }
|
||||
struct PartitionedBitMatrix { partitions: Vec<LayeredBitMatrix> }
|
||||
```
|
||||
|
||||
### Column statistics
|
||||
|
||||
```rust
|
||||
fn sum(&self) -> Array1<u64>
|
||||
// = partitions.par_iter().map(|p| p.sum()).reduce(element-wise +)
|
||||
```
|
||||
|
||||
`p.sum()` is itself a reduction across layers (see above) — the cascade is preserved.
|
||||
|
||||
### Self-contained metrics — single pass
|
||||
|
||||
```rust
|
||||
fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let (sum_min, col_sums) = partitions
|
||||
.par_iter()
|
||||
.map(|p| p.partial_bray())
|
||||
.reduce(element-wise +);
|
||||
// finalise
|
||||
for (i,j): dist[i,j] = 1 - 2·sum_min[i,j] / (col_sums[i] + col_sums[j])
|
||||
trait BitPartials: ColumnWeights {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>);
|
||||
fn partial_hamming(&self) -> Array2<u64>;
|
||||
// provided
|
||||
fn jaccard_dist_matrix(&self) -> Array2<f64> { … }
|
||||
fn hamming_dist_matrix(&self) -> Array2<u64> { … }
|
||||
}
|
||||
```
|
||||
|
||||
### Normalised metrics — two passes
|
||||
**Leaf implementors** (in `obicompactvec`):
|
||||
|
||||
| Type | Traits |
|
||||
|---|---|
|
||||
| `PersistentCompactIntMatrix` | `ColumnWeights` (via `sum()`), `CountPartials` |
|
||||
| `PersistentBitMatrix` | `ColumnWeights` (via `count_ones()`), `BitPartials` |
|
||||
|
||||
`PersistentCompactIntVec` and `PersistentBitVec` do **not** implement these traits — they are single-column primitives, not matrix-level aggregators.
|
||||
|
||||
---
|
||||
|
||||
## `LayeredStore<S>` — `obilayeredmap`
|
||||
|
||||
A single generic wrapper removes the need for separately named `LayeredDataStore` and `PartitionedDataStore` types:
|
||||
|
||||
```rust
|
||||
pub struct LayeredStore<S>(Vec<S>);
|
||||
```
|
||||
|
||||
Three blanket impls propagate the traits up the hierarchy:
|
||||
|
||||
```rust
|
||||
impl<S: ColumnWeights> ColumnWeights for LayeredStore<S> { … } // Σ across inner stores
|
||||
impl<S: CountPartials> CountPartials for LayeredStore<S> { … } // same pattern
|
||||
impl<S: BitPartials> BitPartials for LayeredStore<S> { … } // same pattern
|
||||
```
|
||||
|
||||
Because these blanket impls apply recursively, **`LayeredStore<LayeredStore<S>>`** automatically implements all three traits whenever `S` does — no separate `PartitionedStore` type is needed:
|
||||
|
||||
```
|
||||
PersistentCompactIntMatrix implements CountPartials
|
||||
LayeredStore<PersistentCompactIntMatrix> via blanket impl (= one partition)
|
||||
LayeredStore<LayeredStore<…>> via blanket impl (= partitioned index)
|
||||
```
|
||||
|
||||
### Normalised metrics — two-pass cascade
|
||||
|
||||
The normalised finalisation methods call `col_weights()` first (pass 1), then the normalised partial (pass 2). Both calls are dispatched through the `LayeredStore` blanket impls at every level of the hierarchy, so the cascade is automatic:
|
||||
|
||||
```rust
|
||||
// called on LayeredStore<LayeredStore<PersistentCompactIntMatrix>>
|
||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
// pass 1 — progressive: PartitionedDataStore::sum()
|
||||
// calls LayeredDataStore::sum() per partition (parallel)
|
||||
// calls PersistentCompactIntMatrix::sum() per layer (parallel)
|
||||
let global_sums = self.sum();
|
||||
|
||||
// pass 2 — per-partition partial using global_sums (parallel)
|
||||
let matrix = partitions
|
||||
.par_iter()
|
||||
.map(|p| p.partial_relfreq_bray(&global_sums))
|
||||
.reduce(element-wise +);
|
||||
// finalise
|
||||
for (i,j): dist[i,j] = 1 - matrix[i,j]
|
||||
let global = self.col_weights(); // pass 1 — progressive sum at every level
|
||||
let p = self.partial_relfreq_bray(&global); // pass 2 — global passed in cascade
|
||||
p.mapv(|v| 1.0 - v) // finalise (diagonal zeroed separately)
|
||||
}
|
||||
```
|
||||
|
||||
`global_sums` is exact because each kmer belongs to exactly one (partition, layer) pair — no double-counting. Pass 1 is itself fully parallel at every level of the hierarchy.
|
||||
`global` is exact: each kmer belongs to exactly one `(partition, layer)` pair, so there is no double-counting across the hierarchy.
|
||||
|
||||
---
|
||||
|
||||
@@ -277,11 +254,13 @@ fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
|
||||
| Level | Unit | Coordination |
|
||||
|---|---|---|
|
||||
| Across partitions | `LayeredDataStore` | none — fully independent |
|
||||
| Across layers (self-contained) | `(partition, layer)` pair | none — disjoint kmer sets |
|
||||
| Across layers (normalised, pass 1) | `(partition, layer)` pair | none — sums are additive |
|
||||
| Across layers (normalised, pass 2) | `(partition, layer)` pair | global_sums broadcast read-only |
|
||||
| Within a DataStore (distance matrix) | upper-triangle pair `(i,j)` | none — rayon par_iter |
|
||||
| Across partitions | `LayeredStore<LayeredStore<S>>` inner stores | none — fully independent |
|
||||
| Across layers within a partition | `LayeredStore<S>` inner stores | none — disjoint kmer sets |
|
||||
| Normalised pass 1 (`col_weights`) | per inner store | none — additive |
|
||||
| Normalised pass 2 (partial) | per inner store | `global` broadcast read-only |
|
||||
| Within a matrix (distance) | upper-triangle pair `(i,j)` | none — rayon `par_iter` |
|
||||
|
||||
All levels use rayon `par_iter` internally; `reduce_with` performs a parallel tree reduction.
|
||||
|
||||
---
|
||||
|
||||
@@ -331,16 +310,20 @@ Other derivations: threshold a count matrix → binary presence matrix; union tw
|
||||
|
||||
## Relationship to current implementation
|
||||
|
||||
The current `obilayeredmap` crate implements a subset of this architecture. Key divergences:
|
||||
### What is implemented
|
||||
|
||||
- `Layer<D: LayerData>` fuses `MphfLayer` and one `DataStore` into a single generic type. Multiple data stores on the same MPHF are not supported.
|
||||
- `LayerData::open(dir)` embeds the path convention (`counts/`, `presence/`) inside the store type, preventing the `PartitionedIndex` from managing paths externally.
|
||||
- `LayeredDataStore` and `PartitionedDataStore` do not yet exist; `LayeredMap` is a single-partition structure without a distance matrix API.
|
||||
- The partial distance methods exist on `PersistentCompactIntMatrix` and `PersistentBitMatrix` and are tested; they are not yet composed across layers and partitions.
|
||||
- **`obicompactvec::traits`**: `ColumnWeights`, `CountPartials`, `BitPartials` are defined and implemented on `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
|
||||
- **`obilayeredmap::LayeredStore<S>`**: generic wrapper with blanket impls for all three traits. `LayeredStore<LayeredStore<S>>` is the partitioned level — no separate type needed. Tests confirm that splitting data across layers and across partitions gives the same distance matrices as computing on flat combined data.
|
||||
|
||||
### What is not yet implemented
|
||||
|
||||
- `Layer<D: LayerData>` still fuses `MphfLayer` and one `DataStore`. Multiple data stores on the same MPHF are not supported.
|
||||
- `LayeredMap` is a single-partition structure without a distance matrix API; it does not yet use `LayeredStore`.
|
||||
- No `PartitionedIndex` type for point queries with parallel partition dispatch.
|
||||
|
||||
### Planned refactoring
|
||||
|
||||
Planned refactoring:
|
||||
1. Extract `MphfLayer` from `Layer<D>` as an autonomous type.
|
||||
2. Replace `LayerData` trait with `DataStore` trait (no path knowledge).
|
||||
3. Implement `LayeredCompactIntMatrix` / `LayeredBitMatrix` with the partial + full distance APIs described above.
|
||||
4. Implement `PartitionedCompactIntMatrix` / `PartitionedBitMatrix` with two-pass support for normalised metrics.
|
||||
5. Implement `PartitionedIndex` for point queries with parallel dispatch.
|
||||
2. Replace `LayerData` trait with the `DataStore` / `ColumnWeights` / `CountPartials` / `BitPartials` system.
|
||||
3. Rewire `LayeredMap` to hold `LayeredStore<PersistentCompactIntMatrix>` (or bit variant) alongside the MPHF layers.
|
||||
4. Implement `PartitionedIndex` using `LayeredStore<LayeredStore<S>>` for data and parallel dispatch for queries.
|
||||
|
||||
Generated
+1
@@ -1788,6 +1788,7 @@ dependencies = [
|
||||
"cacheline-ef",
|
||||
"epserde 0.8.0",
|
||||
"memmap2",
|
||||
"ndarray",
|
||||
"obicompactvec",
|
||||
"obikseq",
|
||||
"obiskio",
|
||||
|
||||
@@ -117,6 +117,23 @@ where
|
||||
m
|
||||
}
|
||||
|
||||
// ── Trait impls ───────────────────────────────────────────────────────────────
|
||||
|
||||
use crate::traits::{BitPartials, ColumnWeights};
|
||||
|
||||
impl ColumnWeights for PersistentBitMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.count_ones() }
|
||||
}
|
||||
|
||||
impl BitPartials for PersistentBitMatrix {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
self.partial_jaccard_dist_matrix()
|
||||
}
|
||||
fn partial_hamming(&self) -> Array2<u64> {
|
||||
self.partial_hamming_dist_matrix()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentBitMatrixBuilder {
|
||||
|
||||
@@ -203,6 +203,35 @@ where
|
||||
m
|
||||
}
|
||||
|
||||
// ── Trait impls ───────────────────────────────────────────────────────────────
|
||||
|
||||
use crate::traits::{ColumnWeights, CountPartials};
|
||||
|
||||
impl ColumnWeights for PersistentCompactIntMatrix {
|
||||
fn col_weights(&self) -> Array1<u64> { self.sum() }
|
||||
}
|
||||
|
||||
impl CountPartials for PersistentCompactIntMatrix {
|
||||
fn partial_bray(&self) -> Array2<u64> {
|
||||
self.partial_bray_dist_matrix()
|
||||
}
|
||||
fn partial_euclidean(&self) -> Array2<f64> {
|
||||
self.partial_euclidean_dist_matrix()
|
||||
}
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
self.partial_threshold_jaccard_dist_matrix(threshold)
|
||||
}
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_bray_dist_matrix(global)
|
||||
}
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_relfreq_euclidean_dist_matrix(global)
|
||||
}
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.partial_hellinger_euclidean_dist_matrix(global)
|
||||
}
|
||||
}
|
||||
|
||||
// ── Builder ───────────────────────────────────────────────────────────────────
|
||||
|
||||
pub struct PersistentCompactIntMatrixBuilder {
|
||||
|
||||
@@ -5,12 +5,14 @@ mod format;
|
||||
mod intmatrix;
|
||||
mod meta;
|
||||
mod reader;
|
||||
pub mod traits;
|
||||
|
||||
pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
|
||||
pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder};
|
||||
pub use builder::PersistentCompactIntVecBuilder;
|
||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
|
||||
pub use reader::PersistentCompactIntVec;
|
||||
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
||||
|
||||
#[cfg(test)]
|
||||
#[path = "tests/mod.rs"]
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
use ndarray::{Array1, Array2};
|
||||
|
||||
/// Column-level weight statistic — total count or presence count per column.
|
||||
/// Additive across layers and partitions; used as denominator in normalised distances.
|
||||
pub trait ColumnWeights: Send + Sync {
|
||||
fn col_weights(&self) -> Array1<u64>;
|
||||
}
|
||||
|
||||
/// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
|
||||
///
|
||||
/// Every `partial_*` method returns an additive component: element-wise summing the results
|
||||
/// across layers then across partitions yields the global partial, from which the final
|
||||
/// distance is computed via the corresponding provided method.
|
||||
///
|
||||
/// Normalised methods (`partial_relfreq_*`, `partial_hellinger`) require the **global**
|
||||
/// `col_weights` (summed across all layers and partitions) as a parameter. The provided
|
||||
/// finalisation methods compute this in a first pass via `self.col_weights()`.
|
||||
pub trait CountPartials: ColumnWeights {
|
||||
fn partial_bray(&self) -> Array2<u64>;
|
||||
fn partial_euclidean(&self) -> Array2<f64>;
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>);
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64>;
|
||||
|
||||
// ── Provided finalisation methods ─────────────────────────────────────────
|
||||
|
||||
fn bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let sum_min = self.partial_bray();
|
||||
let w = self.col_weights();
|
||||
let n = w.len();
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let d = w[i] + w[j];
|
||||
m[[i, j]] = if d == 0 { 0.0 }
|
||||
else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / d as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
self.partial_euclidean().mapv(|v| v.sqrt())
|
||||
}
|
||||
|
||||
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64> {
|
||||
let (inter, union) = self.partial_threshold_jaccard(threshold);
|
||||
let n = inter.shape()[0];
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let u = union[[i, j]];
|
||||
m[[i, j]] = if u == 0 { 0.0 }
|
||||
else { 1.0 - inter[[i, j]] as f64 / u as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
let mut m = self.partial_relfreq_bray(&global).mapv(|v| 1.0 - v);
|
||||
let n = m.shape()[0];
|
||||
for i in 0..n { m[[i, i]] = 0.0; }
|
||||
m
|
||||
}
|
||||
|
||||
fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
self.partial_relfreq_euclidean(&global).mapv(|v| v.sqrt())
|
||||
}
|
||||
|
||||
fn hellinger_dist_matrix(&self) -> Array2<f64> {
|
||||
let global = self.col_weights();
|
||||
let sq2 = std::f64::consts::SQRT_2;
|
||||
self.partial_hellinger(&global).mapv(|v| v.sqrt() / sq2)
|
||||
}
|
||||
}
|
||||
|
||||
/// Partial distance matrices for bit-based data (`PersistentBitMatrix`).
|
||||
///
|
||||
/// Both `partial_*` methods are additively decomposable across layers and partitions.
|
||||
pub trait BitPartials: ColumnWeights {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>);
|
||||
fn partial_hamming(&self) -> Array2<u64>;
|
||||
|
||||
// ── Provided finalisation methods ─────────────────────────────────────────
|
||||
|
||||
fn jaccard_dist_matrix(&self) -> Array2<f64> {
|
||||
let (inter, union) = self.partial_jaccard();
|
||||
let n = inter.shape()[0];
|
||||
let mut m = Array2::<f64>::zeros((n, n));
|
||||
for i in 0..n {
|
||||
for j in 0..n {
|
||||
if i != j {
|
||||
let u = union[[i, j]];
|
||||
m[[i, j]] = if u == 0 { 0.0 }
|
||||
else { 1.0 - inter[[i, j]] as f64 / u as f64 };
|
||||
}
|
||||
}
|
||||
}
|
||||
m
|
||||
}
|
||||
|
||||
fn hamming_dist_matrix(&self) -> Array2<u64> {
|
||||
self.partial_hamming()
|
||||
}
|
||||
}
|
||||
@@ -11,6 +11,7 @@ ptr_hash = "1.1"
|
||||
cacheline-ef = "1.1"
|
||||
epserde = "0.8"
|
||||
rayon = "1"
|
||||
ndarray = "0.16"
|
||||
memmap2 = "0.9"
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
|
||||
@@ -0,0 +1,257 @@
|
||||
use ndarray::{Array1, Array2};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use obicompactvec::traits::{BitPartials, ColumnWeights, CountPartials};
|
||||
|
||||
/// Aggregating wrapper around a `Vec<S>`, holding either one entry per layer
/// (within a partition) or one entry per partition.
///
/// The generic impls of `ColumnWeights`, `CountPartials`, and `BitPartials` compose:
/// `LayeredStore<LayeredStore<S>>` implements the same traits as `LayeredStore<S>`,
/// so the partitioned level comes for free.
pub struct LayeredStore<S>(pub Vec<S>);

impl<S> LayeredStore<S> {
    /// Wraps `layers` without copying or reordering them.
    pub fn new(layers: Vec<S>) -> Self {
        LayeredStore(layers)
    }

    /// Borrows the wrapped entries as a slice.
    pub fn layers(&self) -> &[S] {
        self.0.as_slice()
    }

    /// Number of wrapped entries.
    pub fn n_layers(&self) -> usize {
        self.layers().len()
    }

    /// `true` when the store wraps no entries.
    pub fn is_empty(&self) -> bool {
        self.layers().is_empty()
    }
}
|
||||
|
||||
// ── ColumnWeights ─────────────────────────────────────────────────────────────
|
||||
|
||||
impl<S: ColumnWeights> ColumnWeights for LayeredStore<S> {
|
||||
fn col_weights(&self) -> Array1<u64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.col_weights())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap_or_else(|| Array1::zeros(0))
|
||||
}
|
||||
}
|
||||
|
||||
// ── CountPartials ─────────────────────────────────────────────────────────────
|
||||
|
||||
impl<S: CountPartials> CountPartials for LayeredStore<S> {
|
||||
fn partial_bray(&self) -> Array2<u64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_bray())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_euclidean(&self) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_euclidean())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_threshold_jaccard(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_threshold_jaccard(threshold))
|
||||
.reduce_with(|(ai, au), (bi, bu)| (ai + bi, au + bu))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_relfreq_bray(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_relfreq_bray(global))
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_relfreq_euclidean(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_relfreq_euclidean(global))
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_hellinger(&self, global: &Array1<u64>) -> Array2<f64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_hellinger(global))
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
// ── BitPartials ───────────────────────────────────────────────────────────────
|
||||
|
||||
impl<S: BitPartials> BitPartials for LayeredStore<S> {
|
||||
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>) {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_jaccard())
|
||||
.reduce_with(|(ai, au), (bi, bu)| (ai + bi, au + bu))
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn partial_hamming(&self) -> Array2<u64> {
|
||||
self.0.par_iter()
|
||||
.map(|s| s.partial_hamming())
|
||||
.reduce_with(|a, b| a + b)
|
||||
.unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use obicompactvec::{
        PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentCompactIntMatrix,
        PersistentCompactIntMatrixBuilder,
    };
    use tempfile::tempdir;

    /// Absolute tolerance for floating-point distance comparisons.
    const EPS: f64 = 1e-12;

    /// Fails with `label` unless `got` and `expected` differ by less than `EPS`.
    fn assert_close(got: f64, expected: f64, label: &str) {
        assert!((got - expected).abs() < EPS, "{}", label);
    }

    /// Builds a count matrix on disk with one column per slice in `cols`, then reopens it.
    /// The returned `TempDir` must be kept alive for as long as the matrix is used.
    fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
        let n_slots = cols.first().map_or(0, |c| c.len());
        let dir = tempdir().unwrap();
        let mut builder = PersistentCompactIntMatrixBuilder::new(n_slots, dir.path()).unwrap();
        for &col in cols {
            let mut col_builder = builder.add_col().unwrap();
            for (slot, &value) in col.iter().enumerate() {
                col_builder.set(slot, value);
            }
            col_builder.close().unwrap();
        }
        builder.close().unwrap();
        let matrix = PersistentCompactIntMatrix::open(dir.path()).unwrap();
        (dir, matrix)
    }

    /// Builds a bit matrix on disk with one column per slice in `cols`, then reopens it.
    /// The returned `TempDir` must be kept alive for as long as the matrix is used.
    fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
        let n_slots = cols.first().map_or(0, |c| c.len());
        let dir = tempdir().unwrap();
        let mut builder = PersistentBitMatrixBuilder::new(n_slots, dir.path()).unwrap();
        for &col in cols {
            let mut col_builder = builder.add_col().unwrap();
            for (slot, &bit) in col.iter().enumerate() {
                col_builder.set(slot, bit);
            }
            col_builder.close().unwrap();
        }
        builder.close().unwrap();
        let matrix = PersistentBitMatrix::open(dir.path()).unwrap();
        (dir, matrix)
    }

    // ── ColumnWeights ─────────────────────────────────────────────────────────

    #[test]
    fn col_weights_sums_across_layers() {
        // layer 0: col0=[1,2],  col1=[3,4]   → weights [3, 7]
        // layer 1: col0=[10,0], col1=[0,10]  → weights [10, 10]
        // summed:                              [13, 17]
        let (_dir0, layer0) = make_int_matrix(&[&[1, 2], &[3, 4]]);
        let (_dir1, layer1) = make_int_matrix(&[&[10, 0], &[0, 10]]);
        let layered = LayeredStore::new(vec![layer0, layer1]);
        let weights = layered.col_weights();
        assert_eq!(weights[0], 13);
        assert_eq!(weights[1], 17);
    }

    #[test]
    fn col_weights_bit_sums_across_layers() {
        // layer 0: col0=[T,F,T], col1=[F,T,T] → counts [2, 2]
        // layer 1: col0=[F,F,T], col1=[T,T,F] → counts [1, 2]
        // summed:                               [3, 4]
        let (_dir0, layer0) = make_bit_matrix(&[&[true, false, true], &[false, true, true]]);
        let (_dir1, layer1) = make_bit_matrix(&[&[false, false, true], &[true, true, false]]);
        let layered = LayeredStore::new(vec![layer0, layer1]);
        let weights = layered.col_weights();
        assert_eq!(weights[0], 3);
        assert_eq!(weights[1], 4);
    }

    // ── CountPartials — layered (one partition) ───────────────────────────────

    #[test]
    fn layered_bray_matches_combined() {
        // col0=[1,2,3,4,5] and col1=[5,4,3,2,1] are split over two layers; the layered
        // Bray distance must equal the one computed on the unsplit columns.
        let (_dir0, layer0) = make_int_matrix(&[&[1, 2], &[5, 4]]); // slots 0-1
        let (_dir1, layer1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]); // slots 2-4
        let layered = LayeredStore::new(vec![layer0, layer1]);

        // reference: the same data in a single matrix
        let (_dir_full, full) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
        let expected = CountPartials::bray_dist_matrix(&full);
        let got = CountPartials::bray_dist_matrix(&layered);
        assert_close(got[[0, 1]], expected[[0, 1]], "bray [0,1]");
        assert_close(got[[1, 0]], expected[[1, 0]], "bray [1,0]");
    }

    #[test]
    fn layered_relfreq_bray_matches_combined() {
        let (_dir0, layer0) = make_int_matrix(&[&[1, 2], &[5, 4]]);
        let (_dir1, layer1) = make_int_matrix(&[&[3, 4, 5], &[3, 2, 1]]);
        let layered = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
        let expected = CountPartials::relfreq_bray_dist_matrix(&full);
        let got = CountPartials::relfreq_bray_dist_matrix(&layered);
        assert_close(got[[0, 1]], expected[[0, 1]], "relfreq_bray [0,1]");
    }

    #[test]
    fn layered_euclidean_matches_combined() {
        let (_dir0, layer0) = make_int_matrix(&[&[3, 0], &[0, 4]]);
        let (_dir1, layer1) = make_int_matrix(&[&[1, 1], &[2, 2]]);
        let layered = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_int_matrix(&[&[3, 0, 1, 1], &[0, 4, 2, 2]]);
        let expected = CountPartials::euclidean_dist_matrix(&full);
        let got = CountPartials::euclidean_dist_matrix(&layered);
        assert_close(got[[0, 1]], expected[[0, 1]], "euclidean [0,1]");
    }

    // ── CountPartials — partitioned (LayeredStore<LayeredStore<_>>) ───────────

    #[test]
    fn partitioned_bray_matches_combined() {
        // partition 0 holds slots [1,2,3,4,5] / [5,4,3,2,1]
        // partition 1 holds slots [10,20]     / [20,10]
        let (_dir0, part0) = make_int_matrix(&[&[1, 2, 3, 4, 5], &[5, 4, 3, 2, 1]]);
        let (_dir1, part1) = make_int_matrix(&[&[10, 20], &[20, 10]]);

        let partitioned = LayeredStore::new(vec![
            LayeredStore::new(vec![part0]),
            LayeredStore::new(vec![part1]),
        ]);

        let (_dir_full, full) =
            make_int_matrix(&[&[1, 2, 3, 4, 5, 10, 20], &[5, 4, 3, 2, 1, 20, 10]]);
        let expected = CountPartials::bray_dist_matrix(&full);
        let got = CountPartials::bray_dist_matrix(&partitioned);
        assert_close(got[[0, 1]], expected[[0, 1]], "partitioned bray [0,1]");
    }

    // ── BitPartials ───────────────────────────────────────────────────────────

    #[test]
    fn layered_jaccard_matches_combined() {
        let (_dir0, layer0) = make_bit_matrix(&[&[true, false], &[false, true]]);
        let (_dir1, layer1) = make_bit_matrix(&[&[true, true], &[true, false]]);
        let layered = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_bit_matrix(&[
            &[true, false, true, true],
            &[false, true, true, false],
        ]);
        let expected = BitPartials::jaccard_dist_matrix(&full);
        let got = BitPartials::jaccard_dist_matrix(&layered);
        assert_close(got[[0, 1]], expected[[0, 1]], "jaccard [0,1]");
    }

    #[test]
    fn layered_hamming_matches_combined() {
        let (_dir0, layer0) = make_bit_matrix(&[&[true, false], &[false, true]]);
        let (_dir1, layer1) = make_bit_matrix(&[&[true, true], &[false, false]]);
        let layered = LayeredStore::new(vec![layer0, layer1]);

        let (_dir_full, full) = make_bit_matrix(&[
            &[true, false, true, true],
            &[false, true, false, false],
        ]);
        let expected = BitPartials::hamming_dist_matrix(&full);
        let got = BitPartials::hamming_dist_matrix(&layered);
        assert_eq!(got[[0, 1]], expected[[0, 1]], "hamming [0,1]");
    }
}
|
||||
@@ -1,9 +1,11 @@
|
||||
pub mod error;
|
||||
pub mod evidence;
|
||||
pub mod layer;
|
||||
pub mod layered_store;
|
||||
pub mod map;
|
||||
pub mod meta;
|
||||
|
||||
pub use error::{OLMError, OLMResult};
|
||||
pub use layer::{Hit, Layer, LayerData};
|
||||
pub use layered_store::LayeredStore;
|
||||
pub use map::LayeredMap;
|
||||
|
||||
Reference in New Issue
Block a user