docs: document k-mer index architecture and refactor distance metrics

Add comprehensive documentation for the `obilayeredmap` crate, `PersistentCompactIntVec`, `PersistentBitVec`, and the hierarchical k-mer index architecture, including sidebar navigation updates across all documentation pages. Refactor the Bray-Curtis distance computation in `obicompactvec` to decouple numerator and denominator calculations, replacing direct pairwise calls with explicit loops over precomputed sums. Update tests to verify column sum accuracy and align with the simplified API.
This commit is contained in:
Eric Coissac
2026-05-15 21:07:23 +08:00
parent 8409c852ef
commit 45d49ed501
25 changed files with 8842 additions and 117 deletions
+112
View File
@@ -632,6 +632,90 @@
<li class="md-nav__item">
<a href="/implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -714,6 +798,34 @@
<li class="md-nav__item">
<a href="/architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
File diff suppressed because it is too large Load Diff
+115 -1
View File
@@ -9,9 +9,11 @@
<link rel="prev" href="../../../implementation/unitig_evidence/">
<link rel="prev" href="../../../implementation/persistent_bit_vec/">
<link rel="next" href="../../index_architecture/">
@@ -639,6 +641,90 @@
<li class="md-nav__item">
<a href="../../../implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -733,6 +819,34 @@
<li class="md-nav__item">
<a href="../../index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+48
View File
@@ -347,6 +347,42 @@
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../obilayeredmap/">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../persistent_compact_int_vec/">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../persistent_bit_vec/">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
@@ -385,6 +421,18 @@
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../../architecture/index_architecture/">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
+112
View File
@@ -745,6 +745,90 @@
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -827,6 +911,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+316 -3
View File
@@ -745,6 +745,89 @@
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#multilayer-index-architecture" class="md-nav__link">
<span class="md-ellipsis">
Multilayer index architecture
</span>
</a>
<nav class="md-nav" aria-label="Multilayer index architecture">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#motivation" class="md-nav__link">
<span class="md-ellipsis">
Motivation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#layer-structure" class="md-nav__link">
<span class="md-ellipsis">
Layer structure
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#membership-verification" class="md-nav__link">
<span class="md-ellipsis">
Membership verification
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#query-algorithm" class="md-nav__link">
<span class="md-ellipsis">
Query algorithm
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#layer-count-and-probe-cost" class="md-nav__link">
<span class="md-ellipsis">
Layer count and probe cost
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#merging-layers" class="md-nav__link">
<span class="md-ellipsis">
Merging layers
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
@@ -795,6 +878,90 @@
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -877,6 +1044,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
@@ -1002,6 +1197,89 @@
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#multilayer-index-architecture" class="md-nav__link">
<span class="md-ellipsis">
Multilayer index architecture
</span>
</a>
<nav class="md-nav" aria-label="Multilayer index architecture">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#motivation" class="md-nav__link">
<span class="md-ellipsis">
Motivation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#layer-structure" class="md-nav__link">
<span class="md-ellipsis">
Layer structure
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#membership-verification" class="md-nav__link">
<span class="md-ellipsis">
Membership verification
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#query-algorithm" class="md-nav__link">
<span class="md-ellipsis">
Query algorithm
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#layer-count-and-probe-cost" class="md-nav__link">
<span class="md-ellipsis">
Layer count and probe cost
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#merging-layers" class="md-nav__link">
<span class="md-ellipsis">
Merging layers
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
@@ -1071,7 +1349,7 @@
</ul>
<h2 id="mphf-choice-per-phase">MPHF choice per phase</h2>
<p><strong>Phase 1</strong> (provisional, discarded after spectrum computation): FMPHGO. Tolerates overestimated capacity, compact, no need to optimise for query speed on a temporary structure.</p>
<p><strong>Phase 2</strong> (persistent, queried repeatedly): open between FMPHGO and ptr_hash. Exact key count is available, so both operate optimally. ptr_hash's query speed advantage (2.13.3×) is meaningful for the persistent index but carries the risk of a very young crate. FMPHGO is the conservative default; ptr_hash is worth revisiting once it has broader production use.</p>
<p><strong>Phase 2</strong> (persistent, queried repeatedly): <strong>ptr_hash</strong>. Exact key count is available at phase 2, so ptr_hash operates optimally. Its query speed (≥2.1× over FMPHGO) and construction speed (≥3.1×) are meaningful for the persistent index; the space overhead at 2.4 bits/key is acceptable. The crate's youth (Feb 2025) was previously a concern; it is now accepted given the performance profile and the fact that each layer MPHF is independently rebuildable from its unitig file if needed.</p>
<p>boomphf is effectively eliminated: its space overhead is the largest and its streaming-construction advantage does not apply here.</p>
<hr />
<h2 id="space-at-scale">Space at scale</h2>
@@ -1106,12 +1384,47 @@
<h2 id="on-disk-and-mmap-considerations">On-disk and mmap considerations</h2>
<p>All three are in-memory structures. Their internal representation is flat bit arrays (no heap pointers), making them serialisable as contiguous byte blobs and mmappable per partition. True zero-copy access would require rkyv integration; the <code>ph</code> crate currently uses serde, so loading involves a copy. Given per-partition MPHF sizes of 18 MB, the OS page cache handles this transparently — strict zero-copy is a refinement, not a blocker.</p>
<p>No established Rust crate provides a natively on-disk MPHF. <strong>SSHash</strong> (Sparse and Skew Hash) is a complete kmer dictionary designed for disk access and is order-preserving (overlapping kmers receive consecutive indices → cache-friendly count access), but it is C++-only and covers more than just the MPHF layer.</p>
<hr />
<h2 id="multilayer-index-architecture">Multilayer index architecture</h2>
<h3 id="motivation">Motivation</h3>
<p>An index built from a single dataset A can be extended with a new dataset B without rebuilding. This supports incremental construction (adding species, samples, or sequencing runs) and enables set operations across heterogeneous sources.</p>
<h3 id="layer-structure">Layer structure</h3>
<p>Each layer is a self-contained unit:</p>
<div class="highlight"><pre><span></span><code>layer_i/
unitigs.bin — packed 2-bit nucleotide sequences
mphf.bin — ptr_hash index (phase-2, exact key count)
evidence.bin — [(unitig_id, rank)] per MPHF slot (see unitig_evidence.md)
counts.bin — [u32] per MPHF slot
</code></pre></div>
<p>Layers are <strong>disjoint</strong>: a canonical kmer belongs to exactly one layer. Layer 0 is built from dataset A. Adding dataset B proceeds as follows:</p>
<ol>
<li>For each kmer in B: query layer 0 — if found, accumulate count into <code>counts_0[MPHF_0(kmer)]</code>.</li>
<li>Collect all kmers of B not present in any existing layer → set <code>B \ A</code>.</li>
<li>Build layer 1 from <code>B \ A</code> using the standard two-phase pipeline (spectrum, filter, ptr_hash).</li>
</ol>
<p>Adding a third dataset C repeats the process: probe layer 0, then layer 1, then build layer 2 from <code>C \ A \ B</code>.</p>
<h3 id="membership-verification">Membership verification</h3>
<p>ptr_hash maps any input to a valid slot — it does not natively detect absent keys. Membership is verified using the evidence entry: decode the kmer from <code>(unitig_id, rank)</code> and compare to the query. A mismatch means the kmer is absent from this layer; probe the next layer.</p>
<p>This makes the evidence layer load-bearing for correctness, not only for locality.</p>
<h3 id="query-algorithm">Query algorithm</h3>
<div class="highlight"><pre><span></span><code>fn query(kmer) → Option&lt;count&gt;:
for layer in layers:
slot = layer.mphf.query(kmer)
if layer.evidence.decode(slot) == kmer:
return Some(layer.counts[slot])
return None
</code></pre></div>
<p>Expected probe depth: 1 for kmers present in layer 0, increasing for rare kmers added in later layers. In practice, the dominant dataset (largest A) should be layer 0 to minimise average probe depth.</p>
<h3 id="layer-count-and-probe-cost">Layer count and probe cost</h3>
<p>Each probe is a ptr_hash lookup (~10 ns) plus one evidence decode (two array accesses). For L layers the worst case is L probes + 1 None. In practice L is small (25 for typical multi-species databases). No global data structure is needed to route queries; the layer chain is traversed in order.</p>
<h3 id="merging-layers">Merging layers</h3>
<p>Two layer chains can be merged by re-indexing their union through the standard pipeline. This is expensive (full rebuild) but produces an optimal single-layer index. Merge is a maintenance operation, not a query-path requirement.</p>
<h2 id="open-questions">Open questions</h2>
<ul>
<li>Confirm actual partition sizes and overestimation factor on representative metagenomic datasets.</li>
<li>Revisit ptr_hash for phase 2 once the crate has broader production track record.</li>
<li>Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary for the persistent index.</li>
<li><strong>rkyv integration</strong>: all flat arrays in a layer (evidence, counts, presence/absence matrix) map trivially to <code>rkyv::Archive</code> — fixed-size element types, no heap indirection. The presence/absence matrix is the strongest case: at 10 M kmers × 1 000 samples ≈ 1.25 GB per partition, zero-copy mmap via rkyv avoids loading the entire matrix at open time, letting the OS page cache serve only accessed pages. ptr_hash itself is internally a flat bit array and is structurally compatible with rkyv, but requires either native crate support or a wrapper. Assess the wrapper cost and whether ptr_hash is willing to adopt rkyv upstream.</li>
<li>Keep SSHash in mind if the indexing architecture is reconsidered at a higher level.</li>
<li>Determine optimal layer ordering heuristic (by kmer count? by query frequency?) for multi-species databases.</li>
</ul>
File diff suppressed because it is too large Load Diff
+112
View File
@@ -795,6 +795,90 @@
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -877,6 +961,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+112
View File
@@ -767,6 +767,90 @@
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -849,6 +933,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+112
View File
@@ -723,6 +723,90 @@
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -805,6 +889,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+112
View File
@@ -745,6 +745,90 @@
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -827,6 +911,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+150 -2
View File
@@ -6,7 +6,7 @@
<meta charset="utf-8"/>
<meta content="width=device-width,initial-scale=1" name="viewport"/>
<link href="../mphf/" rel="prev"/>
<link href="../../architecture/sequences/invariant/" rel="next"/>
<link href="../obilayeredmap/" rel="next"/>
<link href="../../assets/images/favicon.png" rel="icon"/>
<meta content="mkdocs-1.6.1, mkdocs-material-9.7.6" name="generator"/>
<title>Unitig evidence encoding - obikmer</title>
@@ -467,6 +467,37 @@
</nav>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="#non-determinism-of-the-unitig-decomposition">
<span class="md-ellipsis">
Non-determinism of the unitig decomposition
</span>
</a>
<nav aria-label="Non-determinism of the unitig decomposition" class="md-nav">
<ul class="md-nav__list">
<li class="md-nav__item">
<a class="md-nav__link" href="#source-of-non-determinism">
<span class="md-ellipsis">
Source of non-determinism
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="#consequence-for-mphf-construction">
<span class="md-ellipsis">
Consequence for MPHF construction
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="#open-questions">
<span class="md-ellipsis">
@@ -478,6 +509,42 @@
</ul>
</nav>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../obilayeredmap/">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../persistent_compact_int_vec/">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../persistent_bit_vec/">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
</li>
@@ -513,6 +580,18 @@
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="../../architecture/index_architecture/">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
@@ -698,6 +777,37 @@
</nav>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="#non-determinism-of-the-unitig-decomposition">
<span class="md-ellipsis">
Non-determinism of the unitig decomposition
</span>
</a>
<nav aria-label="Non-determinism of the unitig decomposition" class="md-nav">
<ul class="md-nav__list">
<li class="md-nav__item">
<a class="md-nav__link" href="#source-of-non-determinism">
<span class="md-ellipsis">
Source of non-determinism
</span>
</a>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="#consequence-for-mphf-construction">
<span class="md-ellipsis">
Consequence for MPHF construction
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a class="md-nav__link" href="#open-questions">
<span class="md-ellipsis">
@@ -774,7 +884,8 @@ b_B = \left\lceil \log_2 U \right\rceil + \left\lceil \log_2 L_{max} \right\rcei
</tr>
</tbody>
</table>
<p>On <em>Betula nana</em> (k=31, 256 partitions), m_u ≈ 37.9 kmers/unitig on average; no unitig length distribution data measured yet. The <code>rank</code> field (kmer index within the unitig) fits in a <code>u8</code> as long as no unitig exceeds 255 kmers — guaranteed by the split strategy below.</p>
<p><strong>Structural maximum from superkmer construction.</strong> For k=31 and m=11, the maximum number of consecutive kmers sharing the same minimiser is k m + 1 = <strong>21 kmers</strong> (the minimiser traverses from position km to 0 as the window slides). A unitig that is a single full superkmer therefore has exactly 21 kmers. This is confirmed by a bimodal distribution in empirical data: a sharp peak at 21 kmers appears in all partitions, including the anomalous partition 145. The observed maximum is ~46 kmers (unitigs spanning more than one superkmer), well within u8 range.</p>
<p>On <em>Betula nana</em> (k=31, 256 partitions), m_u ≈ 37.9 kmers/unitig on average. The <code>rank</code> field (kmer index within the unitig) fits in a <code>u8</code> as long as no unitig exceeds 255 kmers — guaranteed by the split strategy below and amply satisfied by empirical maximums (~46 kmers observed).</p>
<h3 id="split-strategy-for-long-unitigs">Split strategy for long unitigs</h3>
<p>For the rare cases where a unitig exceeds 255 kmers, the unitig is split into chunks of at most 255 kmers, with a <strong>k1 nucleotide overlap</strong> at each junction — identical to the way super-kmers are delimited at partition boundaries. Each chunk is self-contained and independently decodable.</p>
<div class="highlight"><pre><span></span><code>original unitig: kmer_0 … kmer_254 | kmer_255 … kmer_N
@@ -1026,6 +1137,43 @@ kmer = nucleotides(unitig_id)[rank .. rank + k] // 2-bit packed slice
<h3 id="forward-vs-reverse-complement">Forward vs reverse complement</h3>
<p>The De Bruijn graph stores only canonical kmers. The evidence encodes the canonical orientation. Callers that need the strand of the original kmer must compare the retrieved kmer with its revcomp at query time; this is a single 64-bit comparison.</p>
<hr/>
<h2 id="non-determinism-of-the-unitig-decomposition">Non-determinism of the unitig decomposition</h2>
<p>The unitig extraction is <strong>not deterministic</strong>: two runs on identical input can produce a different number of unitigs with different sequences, while covering exactly the same canonical k-mer set.</p>
<h3 id="source-of-non-determinism">Source of non-determinism</h3>
<p>The graph nodes are stored in a hash map whose iteration order depends on the hash seed (random per run with <code>ahash::RandomState::new()</code>). The <code>start_iter</code> first pass emits every node whose <code>can_extend_left</code> flag is false — which includes not only true dead-end nodes but also <strong>branch points</strong> (nodes with 2 or more left neighbours, for which <code>unique_neighbor</code> returns <code>None</code>).</p>
<p>When a branch point is encountered before its upstream neighbours, it claims the downstream chain and those neighbours later produce length-k degenerate unitigs. When upstream neighbours are encountered first, they extend through the branch point and consume it.</p>
<p><strong>Example</strong> — fork topology (k = 31):</p>
<div class="highlight"><pre><span></span><code>A → B ← C
D
</code></pre></div>
<p>All four nodes are in the graph. B has two left neighbours (A and C), so <code>can_extend_left = false</code>; B also has one right neighbour D, so <code>can_extend_right = true</code>.</p>
<table>
<thead>
<tr>
<th>iteration order</th>
<th>unitigs produced</th>
<th>count</th>
</tr>
</thead>
<tbody>
<tr>
<td>A first, then B, C</td>
<td>ABD · C</td>
<td>2</td>
</tr>
<tr>
<td>B first, then A, C</td>
<td>BD · A · C</td>
<td>3</td>
</tr>
</tbody>
</table>
<p>Both tilings cover the same 4 canonical k-mers.</p>
<p>Pure cycles (all nodes have both extensions present) are unaffected by this: they are never emitted in the first pass and each cycle produces exactly one unitig regardless of which node the second pass starts from. Only the cycle cut point (and therefore the sequence content) varies.</p>
<h3 id="consequence-for-mphf-construction">Consequence for MPHF construction</h3>
<p>The MPHF is built from the <strong>k-mer set</strong>, not from the unitig sequences themselves. Because both tilings contain the same canonical k-mers, the resulting MPHF is identical. The non-determinism is benign for this use case.</p>
<hr/>
<h2 id="open-questions">Open questions</h2>
<ul>
<li><strong>Rank field width</strong>: u8 covers 255 kmers; storing lengths and ranks in kmer units (not nucleotides) buys k1 extra units of headroom at no cost. On <em>B. nana</em> (k=31), m_u ≈ 38 — well within u8 range on average, but the maximum unitig length has not been measured yet. For genomes with very long unitigs, u16 may be needed; the header could record the actual width if portability is required.</li>
+112
View File
@@ -708,6 +708,90 @@
<li class="md-nav__item">
<a href="implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -790,6 +874,34 @@
<li class="md-nav__item">
<a href="architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+112
View File
@@ -740,6 +740,90 @@
<li class="md-nav__item">
<a href="../implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -822,6 +906,34 @@
<li class="md-nav__item">
<a href="../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
Binary file not shown.
+112
View File
@@ -712,6 +712,90 @@
<li class="md-nav__item">
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -794,6 +878,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+112
View File
@@ -767,6 +767,90 @@
<li class="md-nav__item">
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -849,6 +933,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+112
View File
@@ -712,6 +712,90 @@
<li class="md-nav__item">
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -794,6 +878,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+112
View File
@@ -756,6 +756,90 @@
<li class="md-nav__item">
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
</ul>
</nav>
@@ -838,6 +922,34 @@
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
+235 -68
View File
@@ -58,12 +58,230 @@ trait DataStore {
Concrete types from `obicompactvec`:
| Type | `Item` | Use |
|---|---|---|
| `PersistentCompactIntMatrix` | `Box<[u32]>` | count per sample per slot |
| `PersistentBitMatrix` | `Box<[bool]>` | presence per sample per slot |
| Type | `Item` | Column stats | Use |
|---|---|---|---|
| `PersistentCompactIntMatrix` | `Box<[u32]>` | `sum() -> Array1<u64>` | count per sample per slot |
| `PersistentBitMatrix` | `Box<[bool]>` | `count_ones() -> Array1<u64>` | presence per sample per slot |
A `DataStore` knows nothing about kmers or MPHFs. It is indexed by `usize` slot only. The path to its on-disk files is managed by the `LayeredPartition`, not embedded in the store type.
`sum()` and `count_ones()` are the bridge between the per-matrix level and cross-layer aggregation: they give the total weight of each column within one (partition, layer) pair, which can be summed to get global column weights.
A `DataStore` knows nothing about kmers or MPHFs. It is indexed by `usize` slot only.
---
## Distance matrix API on DataStore types
Both `PersistentCompactIntMatrix` and `PersistentBitMatrix` expose two families of distance matrix methods.
### Full distance matrices
Compute the final `n_cols × n_cols` distance matrix from data within a single matrix. Internally parallelised over the upper triangle via rayon.
```rust
// PersistentCompactIntMatrix
fn bray_dist_matrix(&self) -> Array2<f64>
fn relfreq_bray_dist_matrix(&self) -> Array2<f64>
fn euclidean_dist_matrix(&self) -> Array2<f64>
fn relfreq_euclidean_dist_matrix(&self) -> Array2<f64>
fn hellinger_dist_matrix(&self) -> Array2<f64>
fn jaccard_dist_matrix(&self) -> Array2<f64>
fn threshold_jaccard_dist_matrix(&self, threshold: u32) -> Array2<f64>
// PersistentBitMatrix
fn jaccard_dist_matrix(&self) -> Array2<f64>
fn hamming_dist_matrix(&self) -> Array2<u64>
```
These are convenience methods. For a `LayeredDataStore` or `PartitionedDataStore` they cannot be used directly — the partial API is required.
### Partial distance matrices
Return additive components that can be summed element-wise across (partition, layer) pairs before computing the final distance. This is what makes cross-layer and cross-partition aggregation possible.
**Category 1 — self-contained partials**: additive without any external parameter.
```rust
// PersistentCompactIntMatrix
fn partial_bray_dist_matrix(&self)
-> (Array2<u64>, // sum_min[i,j]
Array1<u64>) // col_sums[k]
fn partial_euclidean_dist_matrix(&self) -> Array2<f64> // sum of squared diffs
fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32)
-> (Array2<u64>, // inter[i,j]
Array2<u64>) // union[i,j]
// PersistentBitMatrix
fn partial_jaccard_dist_matrix(&self)
-> (Array2<u64>, // inter[i,j]
Array2<u64>) // union[i,j]
fn partial_hamming_dist_matrix(&self) -> Array2<u64> // differing bits
```
**Category 2 — normalised partials**: require global column sums as input, computed beforehand across all (partition, layer) pairs.
```rust
// PersistentCompactIntMatrix only
fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>)
-> Array2<f64> // Σ_slot min(a_slot/sum_i, b_slot/sum_j)
fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>)
-> Array2<f64> // Σ_slot (a_slot/sum_i - b_slot/sum_j)²
fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>)
-> Array2<f64> // Σ_slot (√(a/sum_i) - √(b/sum_j))²
```
The `col_sums` parameter must reflect the GLOBAL count across all layers and all partitions — passing a per-layer sum would give a wrong result. This constraint drives the two-pass algorithm described below.
---
## Progressive aggregation principle
Aggregation is **hierarchical**: each level computes its contribution by aggregating from the level immediately below it. No level skips a level or collects raw data from two levels down.
```
PersistentCompactIntMatrix::sum() — column sums for one (partition, layer) matrix
↓ Σ across layers
LayeredCompactIntMatrix::sum() — column sums for one partition
↓ Σ across partitions
PartitionedCompactIntMatrix::sum() — global column sums
```
The same cascade applies to every partial computation:
```
PersistentCompactIntMatrix::partial_bray_dist_matrix() — one (partition, layer)
↓ element-wise Σ across layers
LayeredCompactIntMatrix::partial_bray() — one partition
↓ element-wise Σ across partitions
PartitionedCompactIntMatrix::partial_bray() — global partial → final dist
```
This means `LayeredCompactIntMatrix` never inspects individual `PersistentCompactIntVec` columns directly, and `PartitionedCompactIntMatrix` never inspects individual layers. Each level presents a stable API surface to the level above.
---
## LayeredDataStore — aggregation within one partition
A `LayeredDataStore` holds one `DataStore` per layer within a single partition:
```rust
struct LayeredCompactIntMatrix { layers: Vec<PersistentCompactIntMatrix> }
struct LayeredBitMatrix { layers: Vec<PersistentBitMatrix> }
```
### Column statistics
```rust
// LayeredCompactIntMatrix
fn sum(&self) -> Array1<u64>
// = layers.par_iter().map(|m| m.sum()).reduce(element-wise +)
// LayeredBitMatrix
fn count_ones(&self) -> Array1<u64>
// = layers.par_iter().map(|m| m.count_ones()).reduce(element-wise +)
```
### Self-contained partials
Each method reduces across layers by element-wise addition of per-layer matrices:
```rust
fn partial_bray(&self) -> (Array2<u64>, Array1<u64>)
// Σ_l layer_l.partial_bray_dist_matrix()
fn partial_euclidean(&self) -> Array2<f64>
// Σ_l layer_l.partial_euclidean_dist_matrix()
fn partial_jaccard(&self) -> (Array2<u64>, Array2<u64>)
// Σ_l layer_l.partial_jaccard_dist_matrix() [bit matrix]
// Σ_l layer_l.partial_threshold_jaccard_dist_matrix() [int matrix]
fn partial_hamming(&self) -> Array2<u64>
// Σ_l layer_l.partial_hamming_dist_matrix() [bit matrix]
```
### Normalised partials (require global sums from above)
```rust
fn partial_relfreq_bray(&self, global_sums: &Array1<u64>) -> Array2<f64>
// Σ_l layer_l.partial_relfreq_bray_dist_matrix(global_sums)
fn partial_relfreq_euclidean(&self, global_sums: &Array1<u64>) -> Array2<f64>
// Σ_l layer_l.partial_relfreq_euclidean_dist_matrix(global_sums)
fn partial_hellinger(&self, global_sums: &Array1<u64>) -> Array2<f64>
// Σ_l layer_l.partial_hellinger_euclidean_dist_matrix(global_sums)
```
`global_sums` is provided by the `PartitionedDataStore`; this level does not compute it.
---
## PartitionedDataStore — aggregation across all partitions
A `PartitionedDataStore` holds one `LayeredDataStore` per partition:
```rust
struct PartitionedCompactIntMatrix { partitions: Vec<LayeredCompactIntMatrix> }
struct PartitionedBitMatrix { partitions: Vec<LayeredBitMatrix> }
```
### Column statistics
```rust
fn sum(&self) -> Array1<u64>
// = partitions.par_iter().map(|p| p.sum()).reduce(element-wise +)
```
`p.sum()` is itself a reduction across layers (see above) — the cascade is preserved.
### Self-contained metrics — single pass
```rust
fn bray_dist_matrix(&self) -> Array2<f64> {
let (sum_min, col_sums) = partitions
.par_iter()
.map(|p| p.partial_bray())
.reduce(element-wise +);
// finalise
for (i,j): dist[i,j] = 1 - 2·sum_min[i,j] / (col_sums[i] + col_sums[j])
}
```
### Normalised metrics — two passes
```rust
fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
// pass 1 — progressive: PartitionedDataStore::sum()
// calls LayeredDataStore::sum() per partition (parallel)
// calls PersistentCompactIntMatrix::sum() per layer (parallel)
let global_sums = self.sum();
// pass 2 — per-partition partial using global_sums (parallel)
let matrix = partitions
.par_iter()
.map(|p| p.partial_relfreq_bray(&global_sums))
.reduce(element-wise +);
// finalise
for (i,j): dist[i,j] = 1 - matrix[i,j]
}
```
`global_sums` is exact because each kmer belongs to exactly one (partition, layer) pair — no double-counting. Pass 1 is itself fully parallel at every level of the hierarchy.
---
## Parallelism model
| Level | Unit | Coordination |
|---|---|---|
| Across partitions | `LayeredDataStore` | none — fully independent |
| Across layers (self-contained) | `(partition, layer)` pair | none — disjoint kmer sets |
| Across layers (normalised, pass 1) | `(partition, layer)` pair | none — sums are additive |
| Across layers (normalised, pass 2) | `(partition, layer)` pair | global_sums broadcast read-only |
| Within a DataStore (distance matrix) | upper-triangle pair `(i,j)` | none — rayon par_iter |
---
@@ -80,65 +298,19 @@ for each layer l in p:
return None
```
O(n_layers) MPHF probes in the worst case; O(1) expected (kmer in layer 0). No cross-layer data fusion — the result comes from exactly one layer.
O(n_layers) MPHF probes worst case; O(1) expected. No cross-layer fusion — the result comes from exactly one (partition, layer).
### Sequence scan — `sequence → Vec<(kmer, Option<Item>)>`
Decompose into canonical kmers, group by partition, dispatch to each partition in parallel. Within a partition, probe layers in order per kmer. Collect results.
Parallelism: across partitions (independent). Within a partition: per-kmer probing is sequential across layers but different kmers are independent.
### Aggregation — `→ Accumulator`
For operations that traverse all kmers (distance, presence matrix, global counts):
### Aggregation — `→ Result`
```
result = reduce(
for p in partitions: // parallel
for l in layers(p): // parallel
for p in partitions: // parallel
for l in layers(p): // parallel
partial(DataStore_p_l)
)
```
Each `(partition, layer)` contributes an independent `Partial`. Global result = `reduce(all partials)`.
---
## Aggregator pattern
```rust
trait Aggregator<D: DataStore> {
type Partial: Send;
type Result;
fn partial(&self, store: &D) -> Self::Partial;
fn reduce(&self, parts: impl Iterator<Item=Self::Partial>) -> Self::Result;
}
```
Concrete aggregators:
| Aggregator | `Partial` | `Result` |
|---|---|---|
| `BrayCurtis(i, j)` | `(sum_min, sum_a, sum_b): (u64, u64, u64)` | `f64` |
| `Jaccard(i, j)` | `(inter, union): (u64, u64)` | `f64` |
| `Hellinger(i, j)` | `(sum_sqrt_prod, sum_a, sum_b): (f64, f64, f64)` | `f64` |
| `DistanceMatrix(metric)` | `n×n partial matrix` | `n×n f64 matrix` |
| `PresenceQuery(kmer)` | — | routed to point query |
The `partial` for `BrayCurtis(i, j)` on a `PersistentCompactIntMatrix` with columns i and j already exists as `PersistentCompactIntVec::partial_bray_dist` — it needs to be lifted to the column-pair level on the matrix.
---
## Parallelism model
| Level | Unit | Coordination |
|---|---|---|
| Across partitions | `LayeredPartition` | none — fully independent |
| Across layers (aggregation) | `(partition, layer)` pair | none — disjoint kmer sets |
| Within a layer (point query) | n/a — single layer per kmer | n/a |
| DataStore derivation | one `(partition, layer)` per task | none |
The dispatch model: `PartitionedIndex::aggregate(aggregator)` fans out over partitions (rayon `par_iter`), each partition fans out over its layers, collects partials, then a top-level `reduce` combines.
For normalised metrics replace with the two-pass scheme above.
---
@@ -149,17 +321,11 @@ Because the `MphfLayer` is independent of its data stores, new stores can be der
```
// count → presence/absence, parallel across (partition, layer)
for (p, l) in all_partition_layer_pairs().par_iter():
count_store = open PersistentCompactIntMatrix at (p, l)
count_store = open PersistentCompactIntMatrix at (p, l)
presence_store = PersistentBitMatrix::from_count_matrix(count_store, threshold, dir)
attach presence_store to MphfLayer(p, l)
```
Other derivations:
- Threshold a count matrix → binary presence matrix
- Union two presence matrices (same MPHF, different samples)
- Merge two count matrices (saturating add, column-wise)
All derivations are local to a `(partition, layer)` pair and fully parallelisable.
Other derivations: threshold a count matrix → binary presence matrix; union two presence matrices; merge two count matrices (saturating add, column-wise). All are local to one `(partition, layer)` pair.
---
@@ -169,11 +335,12 @@ The current `obilayeredmap` crate implements a subset of this architecture. Key
- `Layer<D: LayerData>` fuses `MphfLayer` and one `DataStore` into a single generic type. Multiple data stores on the same MPHF are not supported.
- `LayerData::open(dir)` embeds the path convention (`counts/`, `presence/`) inside the store type, preventing the `PartitionedIndex` from managing paths externally.
- The `Aggregator` pattern is not yet implemented; partial distance methods exist on `PersistentCompactIntVec` but are not composed across layers and partitions.
- No `PartitionedIndex` type exists; `LayeredMap` is a single-partition structure.
- `LayeredDataStore` and `PartitionedDataStore` do not yet exist; `LayeredMap` is a single-partition structure without a distance matrix API.
- The partial distance methods exist on `PersistentCompactIntMatrix` and `PersistentBitMatrix` and are tested; they are not yet composed across layers and partitions.
Planned refactoring:
1. Extract `MphfLayer` from `Layer<D>` as an autonomous type.
2. Replace `LayerData` trait with `DataStore` trait (no path knowledge).
3. Implement `LayeredPartition` that holds `Vec<MphfLayer>` and attaches data stores externally.
4. Implement `PartitionedIndex` with parallel dispatch and the `Aggregator` pattern.
3. Implement `LayeredCompactIntMatrix` / `LayeredBitMatrix` with the partial + full distance APIs described above.
4. Implement `PartitionedCompactIntMatrix` / `PartitionedBitMatrix` with two-pass support for normalised metrics.
5. Implement `PartitionedIndex` for point queries with parallel dispatch.
+19 -18
View File
@@ -36,7 +36,20 @@ impl PersistentCompactIntMatrix {
// ── Distance matrices ─────────────────────────────────────────────────────
pub fn bray_dist_matrix(&self) -> Array2<f64> {
self.pairwise(|i, j| self.col(i).bray_dist(self.col(j)))
let sum_min = self.partial_bray_dist_matrix();
let col_sums = self.sum();
let n = self.n_cols();
let mut m = Array2::zeros((n, n));
for i in 0..n {
for j in 0..n {
if i != j {
let denom = col_sums[i] + col_sums[j];
m[[i, j]] = if denom == 0 { 0.0 }
else { 1.0 - 2.0 * sum_min[[i, j]] as f64 / denom as f64 };
}
}
}
m
}
pub fn relfreq_bray_dist_matrix(&self) -> Array2<f64> {
@@ -74,23 +87,11 @@ impl PersistentCompactIntMatrix {
// ── Partial matrices (additively decomposable across layers) ──────────────
/// Returns `(sum_min[n×n], col_sums[n])`.
/// `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]).
/// `col_sums[k]` = Σ_slot col_k[slot].
/// Reduce across layers by element-wise addition before computing the final distance.
pub fn partial_bray_dist_matrix(&self) -> (Array2<u64>, Array1<u64>) {
let n = self.n_cols();
let col_sums: Vec<u64> = (0..n)
.into_par_iter()
.map(|i| self.col(i).sum())
.collect();
let sum_min = self.pairwise_u64(|i, j| {
self.col(i).partial_bray_dist(self.col(j)).0
});
(sum_min, Array1::from_vec(col_sums))
/// Returns `sum_min[n×n]` where `sum_min[i,j]` = Σ_slot min(col_i[slot], col_j[slot]).
/// The denominator `col_sums[i] + col_sums[j]` is obtained from `self.sum()`.
/// Additive across layers by element-wise addition.
pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
}
/// Returns sum of squared differences `[n×n]`.
+8 -18
View File
@@ -141,32 +141,22 @@ impl PersistentCompactIntVec {
#[inline]
/// Returns the Bray-Curtis distance between two compact int vectors.
pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
let (sum_min, denom) = self.partial_bray_dist(other);
let sum_min = self.partial_bray_dist(other);
let denom = self.sum() + other.sum();
if denom == 0 {
return 0.0;
}
1.0 - 2.0 * sum_min as f64 / denom as f64
}
/// Returns the partial Bray-Curtis distance between two compact int vectors.
///
/// Returns a tuple `(sum_min, denom)` where `sum_min` is the sum of the minimum values
/// at each index, and `denom` is the sum of the values in both vectors.
/// This is used internally by [`bray_dist`] and to easily compute the Bray-Curtis distance
/// over a set of vector pairs.
///
/// Returns the tuple `(sum_min, sum_a + sum_b)` where `sum_min` is the sum of the minimum
/// values at each index, `sum_a` is the sum of the first vector's counts, and `sum_b` is
/// the sum of the second vector's counts.
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> (u64, u64) {
/// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis.
/// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`.
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
assert_eq!(self.n, other.len(), "length mismatch");
let (sum_min, sum_a, sum_b) = self
.iter()
self.iter()
.zip(other.iter())
.fold((0u64, 0u64, 0u64), |(sm, sa, sb), (a, b)| {
(sm + a.min(b) as u64, sa + a as u64, sb + b as u64)
});
(sum_min, sum_a + sum_b)
.map(|(a, b)| a.min(b) as u64)
.sum()
}
/// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
+3 -7
View File
@@ -126,21 +126,17 @@ fn jaccard_dist_matrix_values_match_pairwise() {
#[test]
fn partial_bray_dist_matrix_consistent() {
let (_d, m) = make_matrix(&[&[1, 0, 1], &[1, 1, 0], &[0, 1, 1]]);
let (sum_min, col_sums) = m.partial_bray_dist_matrix();
let sum_min = m.partial_bray_dist_matrix();
let col_sums = m.sum();
let n = m.n_cols();
// symmetry of sum_min
// symmetry
for i in 0..n {
for j in 0..n {
assert_eq!(sum_min[[i, j]], sum_min[[j, i]]);
}
}
// col_sums correct
for k in 0..n {
assert_eq!(col_sums[k], m.col(k).sum());
}
// reconstruct distance from partials and compare to direct method
for i in 0..n {
for j in i + 1..n {