refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise the SuperKmer header to store n_kmers instead of seql (avoiding the 0 = 256 nucleotide-length wrap convention), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
+3
-3
@@ -2,10 +2,10 @@
|
||||
src/target
|
||||
data-stress
|
||||
*.fasta
|
||||
*.fasta.gz
|
||||
*.zst
|
||||
*.zst.meta
|
||||
*.pb
|
||||
*.json
|
||||
./**/*.json
|
||||
*.bin
|
||||
*.bin
|
||||
*.json
|
||||
Betula_exilis--IGA-24-33
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
// Project tasks configuration. See https://zed.dev/docs/tasks for documentation.
|
||||
//
|
||||
// Example:
|
||||
[
|
||||
{
|
||||
"label": "Example task",
|
||||
"command": "for i in {1..5}; do echo \"Hello $i/5\"; sleep 1; done",
|
||||
//"args": [],
|
||||
// Env overrides for the command, will be appended to the terminal's environment from the settings.
|
||||
"env": { "foo": "bar" },
|
||||
// Current working directory to spawn the command into, defaults to current project root.
|
||||
//"cwd": "/path/to/working/directory",
|
||||
// Whether to use a new terminal tab or reuse the existing one to spawn the process, defaults to `false`.
|
||||
"use_new_terminal": false,
|
||||
// Whether to allow multiple instances of the same task to be run, or rather wait for the existing ones to finish, defaults to `false`.
|
||||
"allow_concurrent_runs": false,
|
||||
// What to do with the terminal pane and tab, after the command was started:
|
||||
// * `always` — always show the task's pane, and focus the corresponding tab in it (default)
|
||||
// * `no_focus` — always show the task's pane, add the task's tab in it, but don't focus it
|
||||
// * `never` — do not alter focus, but still add/reuse the task's tab in its pane
|
||||
"reveal": "always",
|
||||
// Where to place the task's terminal item after starting the task:
|
||||
// * `dock` — in the terminal dock, "regular" terminal items' place (default)
|
||||
// * `center` — in the central pane group, "main" editor area
|
||||
"reveal_target": "dock",
|
||||
// What to do with the terminal pane and tab, after the command had finished:
|
||||
// * `never` — Do nothing when the command finishes (default)
|
||||
// * `always` — always hide the terminal tab, hide the pane also if it was the last tab in it
|
||||
// * `on_success` — hide the terminal tab on task success only, otherwise behaves similar to `always`
|
||||
"hide": "never",
|
||||
// Which shell to use when running a task inside the terminal.
|
||||
// May take 3 values:
|
||||
// 1. (default) Use the system's default terminal configuration in /etc/passwd
|
||||
// "shell": "system"
|
||||
// 2. A program:
|
||||
// "shell": {
|
||||
// "program": "sh"
|
||||
// }
|
||||
// 3. A program with arguments:
|
||||
// "shell": {
|
||||
// "with_arguments": {
|
||||
// "program": "/bin/bash",
|
||||
// "args": ["--login"]
|
||||
// }
|
||||
// }
|
||||
"shell": "system",
|
||||
// Whether to show the task line in the output of the spawned task, defaults to `true`.
|
||||
"show_summary": true,
|
||||
// Whether to show the command line in the output of the spawned task, defaults to `true`.
|
||||
"show_command": true,
|
||||
// Which edited buffers to save before running the task:
|
||||
// * `all` — save all edited buffers
|
||||
// * `current` — save currently active buffer only
|
||||
// * `none` — don't save any buffers
|
||||
"save": "none",
|
||||
// Represents the tags for inline runnable indicators, or spawning multiple tasks at once.
|
||||
// "tags": []
|
||||
},
|
||||
]
|
||||
@@ -1,6 +0,0 @@
|
||||
{
|
||||
"n_bits": 8,
|
||||
"kmer_size": 31,
|
||||
"minimizer_size": 11,
|
||||
"level": 3
|
||||
}
|
||||
+85
-1
@@ -221,7 +221,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="/theory/kmers/" class="md-nav__link">
|
||||
<a href="/kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -304,6 +304,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="/theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="/theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -498,6 +526,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="/implementation/obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="/implementation/storage/" class="md-nav__link">
|
||||
|
||||
@@ -548,6 +604,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="/implementation/unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../../../implementation/mphf/">
|
||||
<link rel="prev" href="../../../implementation/unitig_evidence/">
|
||||
|
||||
|
||||
|
||||
@@ -228,7 +228,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../../theory/kmers/" class="md-nav__link">
|
||||
<a href="../../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -311,6 +311,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../../theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -505,6 +533,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../../implementation/obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
@@ -555,6 +611,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../../implementation/unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -230,7 +230,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -313,6 +313,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -611,6 +639,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../storage/" class="md-nav__link">
|
||||
|
||||
@@ -661,6 +717,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
<link rel="prev" href="../storage/">
|
||||
|
||||
|
||||
<link rel="next" href="../../architecture/sequences/invariant/">
|
||||
<link rel="next" href="../unitig_evidence/">
|
||||
|
||||
|
||||
|
||||
@@ -64,7 +64,7 @@
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#mphf-selection-analysis-in-progress" class="md-skip">
|
||||
<a href="#mphf-selection-two-phase-indexing-architecture" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
@@ -230,7 +230,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -313,6 +313,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -509,6 +537,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../storage/" class="md-nav__link">
|
||||
|
||||
@@ -597,6 +653,56 @@
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#indexing-architecture" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Indexing architecture
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Indexing architecture">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#superkmer-vs-kmer-counts" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Superkmer vs kmer counts
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-1-provisional-index-and-spectrum" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Phase 1 — provisional index and spectrum
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-2-definitive-index" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Phase 2 — definitive index
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#candidates" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
@@ -606,6 +712,17 @@
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#mphf-choice-per-phase" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
MPHF choice per phase
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
@@ -650,6 +767,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
@@ -765,6 +910,56 @@
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#indexing-architecture" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Indexing architecture
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Indexing architecture">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#superkmer-vs-kmer-counts" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Superkmer vs kmer counts
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-1-provisional-index-and-spectrum" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Phase 1 — provisional index and spectrum
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#phase-2-definitive-index" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Phase 2 — definitive index
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#candidates" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
@@ -774,6 +969,17 @@
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#mphf-choice-per-phase" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
MPHF choice per phase
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
@@ -826,29 +1032,50 @@
|
||||
|
||||
|
||||
|
||||
<h1 id="mphf-selection-analysis-in-progress">MPHF selection — analysis in progress</h1>
|
||||
<p>The choice of Minimal Perfect Hash Function for phase 6 is not yet settled. Three candidates were evaluated.</p>
|
||||
<h1 id="mphf-selection-two-phase-indexing-architecture">MPHF selection — two-phase indexing architecture</h1>
|
||||
<h2 id="indexing-architecture">Indexing architecture</h2>
|
||||
<p>Kmer indexing per partition proceeds in two phases. The separation is necessary because the exact number of unique kmers in a partition is not known until after counting and filtering.</p>
|
||||
<h3 id="superkmer-vs-kmer-counts">Superkmer vs kmer counts</h3>
|
||||
<p>The <code>SKFileMeta</code> sidecar written by <code>SKFileWriter</code> records <code>instances</code> (unique superkmers) and <code>length_sum</code> (total nucleotides). A superkmer of length L contains L − k + 1 kmers, so the kmer count per partition can be estimated as <code>length_sum − instances × (k − 1)</code>. This is an <strong>overestimate</strong> of unique kmers: two distinct superkmers (different flanking contexts, same minimizer) can share kmers. The exact count of unique kmers is only known after enumerating and deduplicating them.</p>
|
||||
<p>Note: two superkmers sharing a kmer necessarily share the same minimizer and therefore always land in the same partition — no kmer can appear in two different partitions.</p>
|
||||
<h3 id="phase-1-provisional-index-and-spectrum">Phase 1 — provisional index and spectrum</h3>
|
||||
<ol>
|
||||
<li>Enumerate all kmers from the dereplicated superkmers of the partition.</li>
|
||||
<li>Build a provisional MPHF over this key set; capacity is pre-allocated from the sidecar estimate (slight overestimate, harmless).</li>
|
||||
<li>Accumulate counts: for each kmer in each superkmer, <code>count[MPHF(kmer)] += sk.count()</code>.</li>
|
||||
<li>Compute the kmer frequency spectrum (histogram: occurrences → number of kmers).</li>
|
||||
<li>Apply count filter (e.g. discard singletons). After filtering, the exact number of surviving kmers is known.</li>
|
||||
<li>Discard the provisional MPHF.</li>
|
||||
</ol>
|
||||
<h3 id="phase-2-definitive-index">Phase 2 — definitive index</h3>
|
||||
<p>Build a new MPHF over the filtered kmer set only, with the exact key count available. This is the persistent per-partition index used for all downstream operations (queries, set operations).</p>
|
||||
<hr />
|
||||
<h2 id="candidates">Candidates</h2>
|
||||
<p><strong>boomphf</strong> (BBHash algorithm, maintained by 10X Genomics):</p>
|
||||
<ul>
|
||||
<li>~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem)</li>
|
||||
<li>Parallel construction; well-tested with DNA kmer data at scale</li>
|
||||
<li>Drawback: largest space footprint of the three</li>
|
||||
<li>Drawback: largest space footprint; streaming construction (no exact count needed) was its main differentiator — irrelevant here since exact count is available at phase 2</li>
|
||||
</ul>
|
||||
<p><strong>ptr_hash</strong> (PtrHash algorithm, Groot Koerkamp, SEA 2025):</p>
|
||||
<ul>
|
||||
<li>~2.4 bits/key; fastest queries (≥2.1× over alternatives, 8–12 ns/key for u64 in tight loops) and fastest construction (≥3.1×)</li>
|
||||
<li>Theoretical foundation solid; paper and Rust crate from the same author</li>
|
||||
<li>Requires exact key count at construction — available at phase 2</li>
|
||||
<li>Drawback: published February 2025 — very young, no production track record</li>
|
||||
</ul>
|
||||
<p><strong>FMPHGO</strong> (<code>ph</code> crate, Beling, ACM JEA 2023):</p>
|
||||
<ul>
|
||||
<li>~2.1 bits/key — most compact of the three; good query speed; parallelisable construction</li>
|
||||
<li>More established than ptr_hash; actively maintained</li>
|
||||
<li>Currently preferred candidate</li>
|
||||
<li>Works well with overestimated capacity → natural fit for phase 1</li>
|
||||
</ul>
|
||||
<h2 id="mphf-choice-per-phase">MPHF choice per phase</h2>
|
||||
<p><strong>Phase 1</strong> (provisional, discarded after spectrum computation): FMPHGO. Tolerates overestimated capacity, compact, no need to optimise for query speed on a temporary structure.</p>
|
||||
<p><strong>Phase 2</strong> (persistent, queried repeatedly): open between FMPHGO and ptr_hash. Exact key count is available, so both operate optimally. ptr_hash's query speed advantage (2.1–3.3×) is meaningful for the persistent index but carries the risk of a very young crate. FMPHGO is the conservative default; ptr_hash is worth revisiting once it has broader production use.</p>
|
||||
<p>boomphf is effectively eliminated: its space overhead is the largest and its streaming-construction advantage does not apply here.</p>
|
||||
<hr />
|
||||
<h2 id="space-at-scale">Space at scale</h2>
|
||||
<p>For 1 024 partitions × 100 M kmers/partition:</p>
|
||||
<p>For 1 024 partitions × 100 M kmers/partition (phase 2 index, after filtering):</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
@@ -875,15 +1102,15 @@
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>In practice, partition sizes depend on the dataset. For a human genome at 30× coverage with p=10 (1 024 partitions), realistic partition sizes are 3–30 M kmers → 1–8 MB per MPHF, well within RAM.</p>
|
||||
<p>For a human genome at 30× coverage with 1 024 partitions, realistic partition sizes are 3–30 M unique kmers → 1–8 MB per phase-2 MPHF, well within RAM.</p>
|
||||
<h2 id="on-disk-and-mmap-considerations">On-disk and mmap considerations</h2>
|
||||
<p>All three are in-memory structures. Their internal representation is flat bit arrays (no heap pointers), making them serialisable as contiguous byte blobs and mmappable per partition. True zero-copy access would require rkyv integration; the <code>ph</code> crate currently uses serde, so loading involves a copy. Given per-partition MPHF sizes of 1–8 MB, the OS page cache handles this transparently — strict zero-copy is a refinement, not a blocker.</p>
|
||||
<p>No established Rust crate provides a natively on-disk MPHF. <strong>SSHash</strong> (Sparse and Skew Hash) is a complete kmer dictionary designed for disk access and is order-preserving (overlapping kmers receive consecutive indices → cache-friendly count access), but it is C++-only and covers more than just the MPHF layer.</p>
|
||||
<h2 id="open-questions">Open questions</h2>
|
||||
<ul>
|
||||
<li>Confirm actual partition sizes on representative metagenomic datasets before fixing the choice.</li>
|
||||
<li>Evaluate whether ptr_hash's query speed advantage (2.1–3.3×) justifies adopting a crate that is less than a year old.</li>
|
||||
<li>Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary.</li>
|
||||
<li>Confirm actual partition sizes and overestimation factor on representative metagenomic datasets.</li>
|
||||
<li>Revisit ptr_hash for phase 2 once the crate has a broader production track record.</li>
|
||||
<li>Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary for the persistent index.</li>
|
||||
<li>Keep SSHash in mind if the indexing architecture is reconsidered at a higher level.</li>
|
||||
</ul>
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -12,7 +12,7 @@
|
||||
<link rel="prev" href="../chunkreader/">
|
||||
|
||||
|
||||
<link rel="next" href="../storage/">
|
||||
<link rel="next" href="../obipipeline/">
|
||||
|
||||
|
||||
|
||||
@@ -230,7 +230,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -313,6 +313,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -633,6 +661,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../storage/" class="md-nav__link">
|
||||
|
||||
@@ -683,6 +739,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../pipeline/">
|
||||
<link rel="prev" href="../obipipeline/">
|
||||
|
||||
|
||||
<link rel="next" href="../mphf/">
|
||||
@@ -230,7 +230,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -313,6 +313,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -509,6 +537,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
@@ -639,6 +695,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
@@ -230,7 +230,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -313,6 +313,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -488,6 +516,17 @@
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#minimizer-sliding-window" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Minimizer sliding window
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
@@ -600,6 +639,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../storage/" class="md-nav__link">
|
||||
|
||||
@@ -650,6 +717,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
@@ -796,6 +891,17 @@
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#minimizer-sliding-window" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Minimizer sliding window
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
@@ -828,7 +934,7 @@
|
||||
|
||||
<h1 id="superkmer-implementation">SuperKmer — implementation</h1>
|
||||
<h2 id="memory-layout">Memory layout</h2>
|
||||
<p>A super-kmer is stored as a <strong>32-bit header</strong> followed by a <strong>byte-aligned nucleotide sequence</strong> (2 bits/base, nucleotide 0 at the MSB of the first byte, max 256 nt):</p>
|
||||
<p>A super-kmer is stored as a <strong>32-bit header</strong> followed by a <strong>byte-aligned nucleotide sequence</strong> (2 bits/base, nucleotide 0 at the MSB of the first byte):</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
@@ -844,21 +950,44 @@
|
||||
<td>Occurrence count (≤ 16 M)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>SEQL</td>
|
||||
<td>NKMERS</td>
|
||||
<td>8</td>
|
||||
<td>Sequence length in nucleotides (1–256)</td>
|
||||
<td>Number of kmers (= seq_length − k + 1, range 1–255)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>Bit layout (MSB to LSB): <code>[31:8] COUNT [7:0] SEQL</code></p>
|
||||
<p>SEQL is stored as a raw <code>u8</code>: values 1–255 represent lengths 1–255; <strong>0 represents 256</strong> (wrapping convention). The public accessor returns a <code>usize</code> and performs the conversion:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">seql</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="n">s</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="mi">0</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="mi">256</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">s</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="p">}</span>
|
||||
<p>Bit layout (MSB to LSB): <code>[31:8] COUNT [7:0] NKMERS</code></p>
|
||||
<p>NKMERS is stored as a raw <code>u8</code> in <strong>kmer units</strong>, not nucleotides. The nucleotide length is recovered as <code>NKMERS + k − 1</code>. This avoids the awkward wrapping convention (<code>0 = 256</code>) that would be needed if nucleotide length were stored directly, and, for k = 31, gains k − 1 = 30 nucleotides of headroom:</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>unit</th>
|
||||
<th>u8 covers</th>
|
||||
<th>max nucleotides</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>nucleotides</td>
|
||||
<td>255 nt</td>
|
||||
<td>255 nt (= 225 kmers)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>kmers</strong></td>
|
||||
<td><strong>255 kmers</strong></td>
|
||||
<td><strong>285 nt</strong></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>The public accessors:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">n_kmers</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">seql</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">n_kmers</span><span class="p">()</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">K</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">count</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">increment</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">add</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">set_count</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">n</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">);</span><span class="w"> </span><span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<p>The SEQL field is 8 bits, capping the stored sequence at 256 nt. Given the expected length of ~40 nt, this cap is almost never reached; when it is, the super-kmer is split at 256 nt with a k−1 overlap, preserving all kmers without duplication.</p>
|
||||
<p>In practice, observed super-kmer lengths on metagenomic data (k=31) are below 55 nucleotides (≤ 25 kmers) — far from the 255-kmer cap. If a super-kmer ever exceeds 255 kmers, it is split with a k−1 nucleotide overlap, preserving all kmers without duplication (identical mechanism to partition-boundary splits).</p>
|
||||
<p>The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.</p>
|
||||
<h2 id="ascii-encoding-and-decoding">ASCII encoding and decoding</h2>
|
||||
<p>Two lookup tables handle ASCII ↔ 2-bit conversion:</p>
|
||||
@@ -883,8 +1012,9 @@
|
||||
<span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<p><code>REVCOMP4</code> is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.</p>
|
||||
<p><strong>Step 2 — realignment.</strong> After step 1, <code>padding = n × 8 − SEQL × 2</code> spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using <code>BitSlice<u8, Msb0>::rotate_left(padding)</code> from the <code>bitvec</code> crate, which is SIMD-accelerated. The trailing <code>padding</code> bits are then zeroed:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="n">shift</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">SEQL</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="c1">// number of padding bits</span>
|
||||
<p><strong>Step 2 — realignment.</strong> After step 1, <code>padding = n × 8 − seql × 2</code> spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using <code>BitSlice<u8, Msb0>::rotate_left(padding)</code> from the <code>bitvec</code> crate, which is SIMD-accelerated. The trailing <code>padding</code> bits are then zeroed:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">n_kmers</span><span class="p">()</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">k</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span>
|
||||
<span class="n">shift</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="c1">// number of padding bits</span>
|
||||
<span class="n">bits</span><span class="p">.</span><span class="n">rotate_left</span><span class="p">(</span><span class="n">shift</span><span class="p">)</span>
|
||||
<span class="n">bits</span><span class="p">[</span><span class="n">len</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">shift</span><span class="o">..</span><span class="p">].</span><span class="n">fill</span><span class="p">(</span><span class="kc">false</span><span class="p">)</span>
|
||||
</code></pre></div>
|
||||
@@ -900,6 +1030,61 @@
|
||||
return seq -- palindrome: either orientation valid
|
||||
</code></pre></div>
|
||||
</div>
|
||||
<h2 id="minimizer-sliding-window">Minimizer sliding window</h2>
|
||||
<p>Super-kmers are built by <code>SuperKmerIter</code> (crate <code>obiskbuilder</code>), which maintains the current minimizer with a <strong>monotonic deque</strong> over a sliding window of W = k − m + 1 m-mer positions.</p>
|
||||
<p>Each deque entry stores:</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Field</th>
|
||||
<th>Type</th>
|
||||
<th>Purpose</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td><code>position</code></td>
|
||||
<td>usize</td>
|
||||
<td>0-based start of this m-mer in the segment</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>canonical</code></td>
|
||||
<td>u64</td>
|
||||
<td>right-aligned canonical m-mer value (lex-min of fwd and rc); used as partition key</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><code>hash</code></td>
|
||||
<td>u64</td>
|
||||
<td><span class="arithmatex">\(H(\text{canonical})\)</span> — ordering key for random minimizer selection</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>The hash <span class="arithmatex">\(H\)</span> is the seeded splitmix64 finalizer (see <a href="../../theory/minimizer/">Minimizer selection</a>):</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">hash_mmer</span><span class="p">(</span><span class="n">canonical</span><span class="p">:</span><span class="w"> </span><span class="kt">u64</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u64</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">canonical</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="mh">0x9e3779b97f4a7c15</span><span class="p">;</span><span class="w"> </span><span class="c1">// seed: eliminates fixed point at 0</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">30</span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">wrapping_mul</span><span class="p">(</span><span class="mh">0xbf58476d1ce4e5b9</span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">27</span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">wrapping_mul</span><span class="p">(</span><span class="mh">0x94d049bb133111eb</span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">31</span><span class="p">)</span>
|
||||
<span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<p>On each new nucleotide, once the window is full, the deque is updated:</p>
|
||||
<div class="admonition abstract">
|
||||
<p class="admonition-title">Algorithm — minimizer deque update</p>
|
||||
<div class="highlight"><pre><span></span><code>procedure UpdateMinimizer(deque, position, canonical, hash, k, received):
|
||||
-- pop dominated entries from the back
|
||||
while deque is not empty and deque.back.hash ≥ hash:
|
||||
deque.pop_back()
|
||||
deque.push_back({position, canonical, hash})
|
||||
|
||||
-- evict expired entries from the front
|
||||
while deque.front.position + k < received:
|
||||
deque.pop_front()
|
||||
</code></pre></div>
|
||||
</div>
|
||||
<p>The front of the deque is always the current minimizer, because the deque is maintained in strictly increasing hash order. Each entry is pushed once and popped at most once, so the update costs O(1) amortized per nucleotide.</p>
|
||||
<p>A super-kmer boundary is emitted when the minimizer changes: <code>deque.front.hash ≠ prev_hash</code>. The <code>canonical</code> field of the front entry is <strong>not</strong> used for boundary detection — that uses the hash alone. The canonical value is stored so that the partition key <span class="arithmatex">\(H(\text{canonical})\)</span> can be recomputed independently at routing time from the stored <code>minimizer_pos</code>, without inheriting the minimum-order-statistic bias (see <a href="../../theory/minimizer/#partition-key-independence">Minimizer selection — partition key independence</a>).</p>
|
||||
<h2 id="kmer-extraction">Kmer extraction</h2>
|
||||
<p>A k-mer is extracted from a super-kmer with <code>SuperKmer::kmer(i, k)</code>, which returns a <code>Result</code> holding either a <code>Kmer</code> — a left-aligned <code>u64</code> newtype (see <a href="../kmer/">Kmer implementation</a>) — or a <code>KmerError</code>:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">kmer</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">i</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="n">Kmer</span><span class="p">,</span><span class="w"> </span><span class="n">KmerError</span><span class="o">></span>
|
||||
@@ -909,8 +1094,9 @@
|
||||
<div class="admonition abstract">
|
||||
<p class="admonition-title">Algorithm — Super-kmer reverse complement</p>
|
||||
<div class="highlight"><pre><span></span><code>procedure SuperKmerRevcomp(seq, SEQL):
|
||||
n ← ⌈SEQL / 4⌉ -- number of bytes
|
||||
shift ← n × 8 − SEQL × 2 -- padding bits to flush
|
||||
seql ← NKMERS + k − 1 -- nucleotide length
|
||||
n ← ⌈seql / 4⌉ -- number of bytes
|
||||
shift ← n × 8 − seql × 2 -- padding bits to flush
|
||||
|
||||
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
|
||||
lo ← 0 ; hi ← n − 1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
+86
-2
@@ -10,7 +10,7 @@
|
||||
|
||||
|
||||
|
||||
<link rel="next" href="theory/kmers/">
|
||||
<link rel="next" href="kmers/">
|
||||
|
||||
|
||||
|
||||
@@ -297,7 +297,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="theory/kmers/" class="md-nav__link">
|
||||
<a href="kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -380,6 +380,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="theory/indexing/" class="md-nav__link">
|
||||
|
||||
@@ -574,6 +602,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="implementation/obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="implementation/storage/" class="md-nav__link">
|
||||
|
||||
@@ -624,6 +680,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="implementation/unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
@@ -9,16 +9,16 @@
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../..">
|
||||
<link rel="prev" href="..">
|
||||
|
||||
|
||||
<link rel="next" href="../encoding/">
|
||||
<link rel="next" href="../theory/encoding/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<link rel="icon" href="../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
<link rel="stylesheet" href="../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
<script>__md_scope=new URL("..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
@@ -80,7 +80,7 @@
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
<a href=".." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
@@ -138,7 +138,7 @@
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
<a href=".." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
@@ -156,7 +156,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
<a href=".." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -357,7 +357,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../encoding/" class="md-nav__link">
|
||||
<a href="../theory/encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -385,7 +385,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../entropy/" class="md-nav__link">
|
||||
<a href="../theory/entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -413,7 +413,35 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../indexing/" class="md-nav__link">
|
||||
<a href="../theory/minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../theory/indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -495,7 +523,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/superkmer/" class="md-nav__link">
|
||||
<a href="../implementation/superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -523,7 +551,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/kmer/" class="md-nav__link">
|
||||
<a href="../implementation/kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -551,7 +579,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/chunkreader/" class="md-nav__link">
|
||||
<a href="../implementation/chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -579,7 +607,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/pipeline/" class="md-nav__link">
|
||||
<a href="../implementation/pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -607,7 +635,35 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
<a href="../implementation/obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../implementation/storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -635,7 +691,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/mphf/" class="md-nav__link">
|
||||
<a href="../implementation/mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -656,6 +712,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../implementation/unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
@@ -717,7 +801,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
<a href="../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -846,7 +930,7 @@
|
||||
<li><strong>k is odd</strong>: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form <code>min(kmer, revcomp(kmer))</code> is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.</li>
|
||||
</ul>
|
||||
<h2 id="super-kmers">Super-kmers</h2>
|
||||
<p>A <strong>super-kmer</strong> is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same <strong>canonical minimizer</strong>. The <strong>canonical minimizer</strong> of a kmer is the smallest value of <code>min(m-mer, revcomp(m-mer))</code> over all m-mers within the kmer (m < k, m odd).</p>
|
||||
<p>A <strong>super-kmer</strong> is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same <strong>canonical minimizer</strong>. The <strong>canonical minimizer</strong> of a kmer is the smallest value of <code>min(m-mer, revcomp(m-mer))</code> over all m-mers within the kmer (m < k, m odd), with the constraint that <strong>non-degenerate m-mers are always preferred</strong> over degenerate ones. A degenerate m-mer is one composed of a single repeated nucleotide (all-A, all-C, all-G, or all-T); such m-mers are selected only if no non-degenerate candidate exists in the window.</p>
|
||||
<h3 id="canonical-super-kmers">Canonical super-kmers</h3>
|
||||
<p>A <strong>canonical super-kmer</strong> is the lexicographic minimum of a super-kmer and its reverse complement:</p>
|
||||
<div class="highlight"><pre><span></span><code>canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
|
||||
@@ -919,10 +1003,10 @@
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "..", "features": [], "search": "../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
<script src="../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
Binary file not shown.
@@ -9,7 +9,7 @@
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../kmers/">
|
||||
<link rel="prev" href="../../kmers/">
|
||||
|
||||
|
||||
<link rel="next" href="../entropy/">
|
||||
@@ -232,7 +232,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -384,6 +384,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../indexing/" class="md-nav__link">
|
||||
|
||||
@@ -578,6 +606,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
@@ -628,6 +684,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
<link rel="prev" href="../encoding/">
|
||||
|
||||
|
||||
<link rel="next" href="../indexing/">
|
||||
<link rel="next" href="../minimizer/">
|
||||
|
||||
|
||||
|
||||
@@ -232,7 +232,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -439,6 +439,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../indexing/" class="md-nav__link">
|
||||
|
||||
@@ -633,6 +661,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
@@ -683,6 +739,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../entropy/">
|
||||
<link rel="prev" href="../minimizer/">
|
||||
|
||||
|
||||
<link rel="next" href="../../implementation/superkmer/">
|
||||
@@ -232,7 +232,7 @@
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmers/" class="md-nav__link">
|
||||
<a href="../../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
@@ -315,6 +315,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../minimizer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Minimizer selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
@@ -578,6 +606,34 @@
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/obipipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
obipipeline library
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
@@ -628,6 +684,34 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/unitig_evidence/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Unitig evidence encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,26 +2,34 @@
|
||||
|
||||
## Memory layout
|
||||
|
||||
A super-kmer is stored as a **32-bit header** followed by a **byte-aligned nucleotide sequence** (2 bits/base, nucleotide 0 at the MSB of the first byte, max 256 nt):
|
||||
A super-kmer is stored as a **32-bit header** followed by a **byte-aligned nucleotide sequence** (2 bits/base, nucleotide 0 at the MSB of the first byte):
|
||||
|
||||
| Field | Bits | Role |
|
||||
|-------|------|------|
|
||||
| COUNT | 24 | Occurrence count (≤ 16 M) |
|
||||
| SEQL | 8 | Sequence length in nucleotides (1–256) |
|
||||
| NKMERS | 8 | Number of kmers (= seq_length − k + 1, range 1–255) |
|
||||
|
||||
Bit layout (MSB to LSB): `[31:8] COUNT [7:0] SEQL`
|
||||
Bit layout (MSB to LSB): `[31:8] COUNT [7:0] NKMERS`
|
||||
|
||||
SEQL is stored as a raw `u8`: values 1–255 represent lengths 1–255; **0 represents 256** (wrapping convention). The public accessor returns a `usize` and performs the conversion:
|
||||
NKMERS is stored as a raw `u8` in **kmer units**, not nucleotides. The nucleotide length is recovered as `NKMERS + k − 1`. This avoids the awkward wrapping convention (`0 = 256`) that would be needed if nucleotide length were stored directly, and gains k−1 = 30 units of headroom:
|
||||
|
||||
| unit stored in the u8 | u8 covers | equivalent in the other unit |
|
||||
|---|---|---|
|
||||
| nucleotides | 255 nt | 225 kmers |
|
||||
| **kmers** | **255 kmers** | **285 nt** |
|
||||
|
||||
The public accessors:
|
||||
|
||||
```rust
|
||||
fn seql(&self) -> usize { if s == 0 { 256 } else { s as usize } }
|
||||
fn n_kmers(&self) -> usize { (self.0 & 0xFF) as usize }
|
||||
fn seql(&self) -> usize { self.n_kmers() + K - 1 }
|
||||
fn count(&self) -> u32 { self.0 >> 8 }
|
||||
fn increment(&mut self) { self.0 += 1 << 8; }
|
||||
fn add(&mut self, n: u32) { self.0 += n << 8; }
|
||||
fn set_count(&mut self, n: u32) { self.0 = (self.0 & 0xFF) | (n << 8); }
|
||||
```
|
||||
|
||||
The SEQL field is 8 bits, capping the stored sequence at 256 nt. Given the expected length of ~40 nt, this cap is almost never reached; when it is, the super-kmer is split at 256 nt with a k−1 overlap, preserving all kmers without duplication.
|
||||
In practice, observed super-kmer lengths on metagenomic data (k=31) are below 55 nucleotides (≤ 25 kmers) — far from the 255-kmer cap. If a super-kmer ever exceeds 255 kmers, it is split with a k−1 nucleotide overlap, preserving all kmers without duplication (identical mechanism to partition-boundary splits).
|
||||
|
||||
The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.
|
||||
|
||||
@@ -61,10 +69,11 @@ const fn revcomp4(x: u8) -> u8 {
|
||||
|
||||
`REVCOMP4` is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.
|
||||
|
||||
**Step 2 — realignment.** After step 1, `padding = n × 8 − SEQL × 2` spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using `BitSlice<u8, Msb0>::rotate_left(padding)` from the `bitvec` crate, which is SIMD-accelerated. The trailing `padding` bits are then zeroed:
|
||||
**Step 2 — realignment.** After step 1, `padding = n × 8 − seql × 2` spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using `BitSlice<u8, Msb0>::rotate_left(padding)` from the `bitvec` crate, which is SIMD-accelerated. The trailing `padding` bits are then zeroed:
|
||||
|
||||
```rust
|
||||
shift = n * 8 - SEQL * 2 // number of padding bits
|
||||
let seql = self.n_kmers() + k - 1;
|
||||
shift = n * 8 - seql * 2 // number of padding bits
|
||||
bits.rotate_left(shift)
|
||||
bits[len - shift..].fill(false)
|
||||
```
|
||||
@@ -141,8 +150,9 @@ The bit slice `seq[i*2 .. (i+k)*2]` (Msb0 order) is loaded as a big-endian `u64`
|
||||
!!! abstract "Algorithm — Super-kmer reverse complement"
|
||||
```text
|
||||
procedure SuperKmerRevcomp(seq, SEQL):
|
||||
n ← ⌈SEQL / 4⌉ -- number of bytes
|
||||
shift ← n × 8 − SEQL × 2 -- padding bits to flush
|
||||
seql ← NKMERS + k − 1 -- nucleotide length
|
||||
n ← ⌈seql / 4⌉ -- number of bytes
|
||||
shift ← n × 8 − seql × 2 -- padding bits to flush
|
||||
|
||||
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
|
||||
lo ← 0 ; hi ← n − 1
|
||||
|
||||
@@ -0,0 +1,232 @@
|
||||
# Unitig-based MPHF evidence encoding
|
||||
|
||||
## Role of unitigs in the index
|
||||
|
||||
The MPHF maps each canonical kmer to an integer slot, but provides no way to reconstruct the kmer from its slot. A downstream operation (query, set operation) that receives a slot index and needs the kmer sequence must be able to retrieve it. The **evidence file** serves this purpose: it stores the kmer sequences in compact form and provides, for each MPHF slot, a pointer to where the corresponding kmer can be decoded.
|
||||
|
||||
Unitigs are the natural compact representation: a run of L nucleotides encodes L − k + 1 consecutive canonical kmers. The entire kmer set of a partition can be reconstructed from its unitig FASTA file.
|
||||
|
||||
---
|
||||
|
||||
## Two encoding strategies
|
||||
|
||||
### Strategy A — global nucleotide offset
|
||||
|
||||
Each MPHF slot stores a single integer: the offset, counted in nucleotides, of the kmer's first nucleotide within a packed 2-bit nucleotide array that concatenates all unitigs.
|
||||
|
||||
```
|
||||
evidence[slot] = global_offset (bits: ⌈log₂ N_nuc⌉)
|
||||
```
|
||||
|
||||
where `N_nuc` is the total number of nucleotides across all unitigs in the partition.
|
||||
|
||||
Decoding: read k nucleotides starting at `global_offset`.
|
||||
|
||||
### Strategy B — (unitig_id, rank within unitig)
|
||||
|
||||
Each MPHF slot stores a pair:
|
||||
|
||||
```
|
||||
evidence[slot] = (unitig_id, rank)
|
||||
```
|
||||
|
||||
- `unitig_id` : index of the unitig in the partition (0-based)
|
||||
- `rank` : kmer index within the unitig (0 ≤ rank < n_kmers); kmer i starts at nucleotide i, so the nucleotide offset is identical numerically but the kmer-unit interpretation is the natural one
|
||||
|
||||
Decoding: look up the unitig at `unitig_id`, then read k nucleotides starting at `rank`.
|
||||
|
||||
---
|
||||
|
||||
## Bit-cost analysis
|
||||
|
||||
Define for a partition of P kmers with average kmers-per-unitig m:
|
||||
|
||||
- total nucleotides: $N_{nuc} = P \cdot \left(1 + \dfrac{k-1}{m}\right)$
|
||||
- number of unitigs: $U = P / m$
|
||||
|
||||
**Strategy A**
|
||||
|
||||
$$
|
||||
b_A = \left\lceil \log_2 N_{nuc} \right\rceil = \left\lceil \log_2 P + \log_2\!\left(1 + \frac{k-1}{m}\right) \right\rceil
|
||||
$$
|
||||
|
||||
**Strategy B**
|
||||
|
||||
$$
|
||||
b_B = \left\lceil \log_2 U \right\rceil + \left\lceil \log_2 L_{max} \right\rceil
|
||||
$$
|
||||
|
||||
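where $L_{max}$ is the maximum unitig length, counted in kmers (the rank is a kmer index, $0 \le \text{rank} < n_{kmers}$, so its width is set by the largest kmer count rather than by the nucleotide length). In practice $L_{max} \ll P$, so the rank field is much cheaper than the full global offset. If unitig lengths are bounded (e.g. by partition structure), the rank field width is a small constant independent of P.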
|
||||
|
||||
### Empirical bound on unitig length
|
||||
|
||||
Lengths and ranks are expressed in **kmer units** (not nucleotides): the nucleotide length is `n_kmers + k − 1`, so storing `n_kmers` instead of `seq_length` saves k−1 = 30 units of headroom in the same field width.
|
||||
|
||||
Consequence for `u8` capacity:
|
||||
|
||||
| unit stored in the u8 | max representable | equivalent in the other unit |
|
||||
|---|---|---|
|
||||
| nucleotides | 255 nuc | 225 kmers |
|
||||
| **kmers** | **255 kmers** | **285 nuc** |
|
||||
|
||||
On *Betula nana* (k=31, 256 partitions), m_u ≈ 37.9 kmers/unitig on average; no unitig length distribution data measured yet. The `rank` field (kmer index within the unitig) fits in a `u8` as long as no unitig exceeds 255 kmers — guaranteed by the split strategy below.
|
||||
|
||||
### Split strategy for long unitigs
|
||||
|
||||
For the rare cases where a unitig exceeds 255 kmers, the unitig is split into chunks of at most 255 kmers, with a **k−1 nucleotide overlap** at each junction — identical to the way super-kmers are delimited at partition boundaries. Each chunk is self-contained and independently decodable.
|
||||
|
||||
```
|
||||
original unitig: kmer_0 … kmer_254 | kmer_255 … kmer_N
|
||||
↑ cut here
|
||||
|
||||
chunk 1: nucleotides 0 … 284 (255 kmers)
|
||||
chunk 2: nucleotides 255 … N+k-1 (N-255+1 kmers)
|
||||
shared: nucleotides 255 … 284 (k-1 = 30 nucleotides, stored in both)
|
||||
```
|
||||
|
||||
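Cost of one split: k−1 = 30 redundant nucleotides = 60 bits. This event is rare in practice (m_u ≈ 38 for *B. nana*, well below the 255-kmer cap). No kmer is lost: kmer i is in chunk 1 if i < 255, in chunk 2 (at rank i−255) otherwise. A unitig longer than 510 kmers is cut repeatedly in the same way, so that in general kmer i lands in chunk ⌊i/255⌋ + 1 at rank i mod 255.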
|
||||
|
||||
### Savings from u8 length fields
|
||||
|
||||
Because all chunks are guaranteed ≤ 255 kmers, the per-chunk length array in the binary index is a flat `u8` array — 1 byte per chunk instead of 8 bytes (usize) or 4 bytes (u32). For a partition with 4 M unitigs:
|
||||
|
||||
| length type | bytes/chunk | total (4 M chunks) |
|
||||
|---|---|---|
|
||||
| usize (u64) | 8 | 32 MB |
|
||||
| u32 | 4 | 16 MB |
|
||||
| **u8** | **1** | **4 MB** |
|
||||
|
||||
Random access to chunk i is recovered at load time by a single prefix-sum pass over the u8 array, computing a u32/u64 offset array in O(n_chunks) time and O(n_chunks × 4) bytes — paid once at open time, cached for the lifetime of the partition handle.
|
||||
|
||||
Bit costs for *Betula nana* (k=31, 256 partitions, P ≈ 10.4 M, U ≈ 275 k, m_u ≈ 37.9):
|
||||
|
||||
| field | strategy A | strategy B |
|
||||
|---|---|---|
|
||||
| offset / id | $\lceil\log_2(P \cdot (1 + 30/m_u))\rceil = 25$ bits | $\lceil\log_2(U)\rceil = 19$ bits |
|
||||
| rank | — | 8 bits (u8, fixed) |
|
||||
| **total** | **25 bits** | **27 bits** |
|
||||
|
||||
Strategy A is 2 bits cheaper. Strategy B's main advantage is **locality**: decoding a kmer touches one unitig's cache lines rather than an arbitrary offset in a large flat array, and the `rank` field doubles as a direct index into the packed nucleotide sequence without pointer arithmetic.
|
||||
|
||||
---
|
||||
|
||||
## Partition-size tradeoff
|
||||
|
||||
The total bits/kmer for the index (sequence + evidence + MPHF) as a function of partition size is:
|
||||
|
||||
$$
|
||||
\text{total} = \underbrace{2\!\left(1 + \frac{k-1}{m}\right)}_{\text{sequence}} + \underbrace{\log_2 P + \log_2\!\left(1+\frac{k-1}{m}\right)}_{\text{evidence}} + \underbrace{c_{MPHF}}_{\approx 2\text{–}4}
|
||||
$$
|
||||
|
||||
### Empirical observation: m_u is set by De Bruijn graph topology, not partition count
|
||||
|
||||
Measured on *Betula nana* (k=31, m=11), summing n_kmers and sequence counts across all partition files:
|
||||
|
||||
| N partitions | m_sk | m_u | factor m_u/m_sk | nuc ratio (u/sk) |
|
||||
|---|---|---|---|---|
|
||||
| 1 | 12.13 | **41.89** | 3.45× | 0.273 |
|
||||
| 16 | 12.13 | **38.19** | 3.15× | 0.376 |
|
||||
| 256 | 12.13 | **37.90** | 3.12× | 0.388 |
|
||||
| 1 024 | 12.13 | **37.89** | 3.12× | 0.389 |
|
||||
|
||||
- `m_sk` = avg kmers/super-kmer (invariant — same dataset regardless of partition scheme)
|
||||
- `m_u` = avg kmers/unitig = total_n_kmers / total_unitigs, summed across all partitions
|
||||
- `nuc ratio` = (u_symbols + 30·u_reads) / (sk_symbols + 30·sk_reads)
|
||||
|
||||
X-axis in both charts: partition bits (0 = 1 partition, 10 = 1024 partitions) — each step doubles the partition count.
|
||||
|
||||
```mermaid
|
||||
xychart-beta
|
||||
title "m_u (avg kmers/unitig) vs partition bits — B. nana k=31"
|
||||
x-axis "partition bits" 0 --> 10
|
||||
y-axis "m_u" 37 --> 43
|
||||
line [41.89, 40.78, 39.22, 38.52, 38.19, 38.03, 37.96, 37.92, 37.90, 37.89, 37.89]
|
||||
```
|
||||
|
||||
```mermaid
|
||||
xychart-beta
|
||||
title "Nucleotide storage: unitigs / super-kmers (%) vs partition bits — B. nana k=31"
|
||||
x-axis "partition bits" 0 --> 10
|
||||
y-axis "%" 25 --> 42
|
||||
line [27.3, 29.7, 33.9, 36.3, 37.6, 38.3, 38.6, 38.7, 38.8, 38.9, 38.9]
|
||||
```
|
||||
|
||||
Key observations:
|
||||
|
||||
1. **Partition boundaries have a small but non-zero effect on m_u.** Going from 1 to 1024 partitions reduces m_u by 10% (41.9 → 37.9). Within the practical range 16–1024, the variation is under 1% — m_u is effectively constant.
|
||||
2. **m_u is a property of the De Bruijn graph, not the partition scheme.** The dominant factor is graph branching (heterozygosity, repeats, sequencing errors).
|
||||
3. **Unitigs provide substantial compaction over super-kmers.** At 256 partitions, unitigs cover the same unique kmers using 39% of the raw nucleotide content of super-kmers (≈ 2.6× less nucleotide storage), while packing 3.1× more kmers per sequence (m_u / m_sk).
|
||||
|
||||
#### Per-partition compaction ratio (sk_symbols / u_symbols)
|
||||
|
||||
The ratio measures how redundant the super-kmer representation is, i.e. how often the same kmer occupies a kmer-slot in several super-kmer records: a ratio of 1.35 means each unique kmer (counted once in unitigs) appears in 1.35 super-kmer kmer-slots on average.
|
||||
|
||||
| bits | N partitions | median ratio | min ratio | min partition | min u_reads |
|
||||
|---|---|---|---|---|---|
|
||||
| 6 | 64 | 1.355 | 1.073 | — | 4.5 M |
|
||||
| 7 | 128 | 1.352 | 1.037 | — | 4.1 M |
|
||||
| 8 | 256 | **1.350** | **1.012** | **145** | **3.8 M** |
|
||||
| 9 | 512 | 1.350 | 0.998 | 145 | 3.6 M |
|
||||
| 10 | 1024 | 1.351 | 0.992 | 145 | 3.6 M |
|
||||
|
||||
The median stabilises at **1.35** from 64 partitions onward (stdev = 0.027 at 256 partitions). There is one persistent outlier: **partition 145** (at 256-partition resolution) is consistently anomalous across all partition depths — it contains 10–14× more super-kmers and unitigs than the average partition, with a ratio near 1.0, meaning the unitig representation provides almost no kmer deduplication. This is consistent with a highly repetitive or organellar region where the dominant minimiser belongs to a sequence that appears in many reads without forming long overlapping paths in the De Bruijn graph.
|
||||
|
||||
Per-partition parameters at 256 partitions (*B. nana*):
|
||||
|
||||
| quantity | value |
|
||||
|---|---|
|
||||
| P (unique kmers/partition, avg) | ≈ 10.4 M |
|
||||
| U (unitigs/partition, avg) | ≈ 275 k |
|
||||
| m_u | ≈ 37.9 |
|
||||
| Strategy A bits/kmer | ⌈log₂(P·(1+30/m_u))⌉ = 25 |
|
||||
| Strategy B bits/kmer | ⌈log₂(U)⌉ + 8 = 27 |
|
||||
|
||||
Consequence: **the partition count should be as large as memory and parallelism allow.** Each doubling saves 1 bit/kmer in evidence (log₂ P decreases by 1). The sequence term 2·(1 + 30/m_u) ≈ 3.6 bits/kmer is approximately constant.
|
||||
|
||||
Strategy B partially decouples evidence cost from P: `log₂(U) = log₂(P/m_u) = log₂(P) − log₂(m_u)` is lower than `log₂(P)` by a constant log₂(m_u) ≈ 5 bits, but it still grows by 1 bit for each doubling of P. Strategy B's main benefit remains locality and bounded rank width, not asymptotic compression.
|
||||
|
||||
---
|
||||
|
||||
## Implementation notes
|
||||
|
||||
### Evidence file layout (strategy B)
|
||||
|
||||
```
|
||||
evidence.bin
|
||||
├── header : k (u8), n_kmers (u64), n_unitigs (u64)
|
||||
├── id_array : n_kmers × ⌈log₂ n_unitigs⌉ bits — MPHF slot → unitig_id
|
||||
└── rank_array: n_kmers × 8 bits (u8[n_kmers]) — MPHF slot → rank within unitig
|
||||
```
|
||||
|
||||
`id_array` is a compact bit-packed vector (width = ⌈log₂ n_unitigs⌉; 19 bits for *B. nana* at 256 partitions). `rank_array` is a plain `u8` array — no bit-packing needed. Access is O(1) with a single multiplication and mask for `id_array`, and a direct byte index for `rank_array`.
|
||||
|
||||
### Unitig file layout
|
||||
|
||||
FASTA with JSON annotation header (xxHash-64 ID, seq_length, kmer_size, n_kmers). The nucleotide sequence is stored in ASCII uppercase; a 2-bit packed version is derived at query time or stored as a parallel `.2bit` file for speed.
|
||||
|
||||
```
|
||||
>C4A1E7F29B3D5A60 {"seq_length":87,"kmer_size":31,"n_kmers":57}
|
||||
ACGTGGCTA...
|
||||
```
|
||||
|
||||
### Decoding a kmer from slot s
|
||||
|
||||
```
|
||||
unitig_id = id_array[s]
|
||||
rank = rank_array[s]
|
||||
kmer = nucleotides(unitig_id)[rank .. rank + k] // 2-bit packed slice
|
||||
```
|
||||
|
||||
One array lookup per field, then a packed slice extraction. The canonical kmer is the stored sequence (by construction — only canonical kmers are inserted into the graph).
|
||||
|
||||
### Forward vs reverse complement
|
||||
|
||||
The De Bruijn graph stores only canonical kmers. The evidence encodes the canonical orientation. Callers that need the strand of the original kmer must compare the retrieved kmer with its revcomp at query time; this is a single 64-bit comparison.
|
||||
|
||||
---
|
||||
|
||||
## Open questions
|
||||
|
||||
- **Rank field width**: u8 covers 255 kmers; storing lengths and ranks in kmer units (not nucleotides) buys k−1 extra units of headroom at no cost. On *B. nana* (k=31), m_u ≈ 38 — well within u8 range on average, but the maximum unitig length has not been measured yet. For genomes with very long unitigs, u16 may be needed; the header could record the actual width if portability is required.
|
||||
- **Packed nucleotide cache**: storing a 2-bit packed nucleotide array alongside the FASTA avoids re-encoding at query time; negligible space overhead ($N_{nuc} / 4$ bytes per partition).
|
||||
- **Cross-partition evidence**: for set operations spanning multiple partitions, strategy B allows unitig-level operations (e.g. mark entire unitigs as present/absent) rather than kmer-level, potentially reducing the operation cost by a factor of m.
|
||||
File diff suppressed because it is too large
Load Diff
+6
-1
@@ -16,7 +16,11 @@ markdown_extensions:
|
||||
- admonition
|
||||
- footnotes
|
||||
- tables
|
||||
- pymdownx.superfences
|
||||
- pymdownx.superfences:
|
||||
custom_fences:
|
||||
- name: mermaid
|
||||
class: mermaid
|
||||
format: !!python/name:pymdownx.superfences.fence_code_format
|
||||
- pymdownx.arithmatex:
|
||||
generic: true
|
||||
|
||||
@@ -39,6 +43,7 @@ nav:
|
||||
- obipipeline library: implementation/obipipeline.md
|
||||
- On-disk storage: implementation/storage.md
|
||||
- MPHF selection: implementation/mphf.md
|
||||
- Unitig evidence encoding: implementation/unitig_evidence.md
|
||||
- Architecture:
|
||||
- Sequences: architecture/sequences/invariant.md
|
||||
|
||||
|
||||
Executable
+106
@@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare the canonical k-mer sets of two FASTA files.
|
||||
|
||||
Reports how many k-mers are shared, exclusive to each file, or missing.
|
||||
Handles plain and gzip-compressed FASTA (.gz).
|
||||
|
||||
Usage
|
||||
-----
|
||||
compare_kmers.py -k 31 file_a.fasta.gz file_b.fasta.gz
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Translation table mapping each nucleotide to its complement, preserving case.
COMP = str.maketrans("ACGTacgt", "TGCAtgca")


def revcomp(seq: str) -> str:
    """Return the reverse complement of *seq*.

    Characters absent from the translation table (anything but A/C/G/T in
    either case) are kept as they are and only change position.
    """
    complemented = seq.translate(COMP)
    return "".join(reversed(complemented))


def canonical(seq: str) -> str:
    """Return the lexicographically smaller of *seq* and its reverse complement."""
    return min(seq, revcomp(seq))
|
||||
|
||||
|
||||
def open_fasta(path: str):
    """Open *path* for text reading, decompressing on the fly when it ends in ``.gz``."""
    is_gzipped = Path(path).suffix == ".gz"
    opener = gzip.open if is_gzipped else open
    mode = "rt" if is_gzipped else "r"
    return opener(path, mode)
|
||||
|
||||
|
||||
def iter_sequences(path: str):
    """Yield ``(header, sequence)`` tuples for each record of a FASTA file.

    The header is the text following ``>``; the sequence is the concatenation
    of the record's lines, upper-cased. Lines met before the first header are
    discarded.
    """
    name = None
    chunks = []
    with open_fasta(path) as handle:
        for raw in handle:
            text = raw.rstrip()
            if not text.startswith(">"):
                chunks.append(text.upper())
                continue
            # A new record starts: flush the one being accumulated, if any.
            if name is not None:
                yield name, "".join(chunks)
            name, chunks = text[1:], []
    # Flush the last record of the file.
    if name is not None:
        yield name, "".join(chunks)
|
||||
|
||||
|
||||
def extract_kmers(path: str, k: int) -> set[str]:
    """Return the set of canonical k-mers from all sequences in *path*.

    A window is kept only when all of its k characters are A, C, G or T
    (sequences are upper-cased by ``iter_sequences``); every window that
    overlaps any other character is skipped.

    Raises:
        ValueError: if *k* is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k-mer size must be >= 1, got {k}")
    kmers: set[str] = set()
    for _, seq in iter_sequences(path):
        # `run` is the length of the stretch of valid nucleotides ending at
        # the current position; a valid k-mer ends here iff that stretch
        # spans at least k bases. Each base is therefore checked once
        # instead of being re-scanned for every window that contains it.
        run = 0
        for end, base in enumerate(seq, start=1):
            run = run + 1 if base in "ACGT" else 0
            if run >= k:
                kmers.add(canonical(seq[end - k : end]))
    return kmers
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: compare the canonical k-mer sets of two FASTA files.

    Prints the parameters and a summary table on stdout, progress messages on
    stderr, and exits with status 1 when the two sets differ.
    """
    cli = argparse.ArgumentParser(
        description="Compare canonical k-mer sets between two FASTA files."
    )
    cli.add_argument("file_a", help="First FASTA file (reference)")
    cli.add_argument("file_b", help="Second FASTA file (to compare)")
    cli.add_argument(
        "-k", "--kmer-size", type=int, default=31, metavar="K", help="k-mer size (default: 31)"
    )
    opts = cli.parse_args()

    k = opts.kmer_size
    for label, value in (("k", k), ("A", opts.file_a), ("B", opts.file_b)):
        print(f"{label} = {value}")
    print()

    print("reading A …", file=sys.stderr)
    kmers_a = extract_kmers(opts.file_a, k)
    print("reading B …", file=sys.stderr)
    kmers_b = extract_kmers(opts.file_b, k)

    lost = kmers_a - kmers_b
    gained = kmers_b - kmers_a
    shared = kmers_a & kmers_b

    report = (
        ("kmers in A", kmers_a),
        ("kmers in B", kmers_b),
        ("common", shared),
        ("only in A (lost)", lost),
        ("only in B (gained)", gained),
    )
    for title, group in report:
        print(f"{title:<25} {len(group):>12,}")

    if not (lost or gained):
        print("\nSets are identical.")
        return
    print("\nSets differ.", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+125
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env bash
|
||||
# jj_commit_msg.sh — generate a commit message from the current jj change using aichat
|
||||
#
|
||||
# Usage: jj_commit_msg.sh
|
||||
# Summarises each changed file's diff individually, then combines all
|
||||
# summaries into a single commit message via aichat.
|
||||
#
|
||||
# Typical use:
|
||||
# jj describe -m "$(jj_commit_msg.sh)"
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
# Log to stderr so progress doesn't pollute the commit message on stdout
|
||||
# log <msg…>: step heading, prefixed with a bold blue "==>" (stderr).
log() {
    printf '\033[1;34m==>\033[0m %s\n' "$*" >&2
}

# info <msg…>: detail line printed in light grey (stderr).
info() {
    printf ' \033[0;37m%s\033[0m\n' "$*" >&2
}

# ok <msg…>: success line prefixed with a green check mark (stderr).
ok() {
    printf ' \033[0;32m✓\033[0m %s\n' "$*" >&2
}
|
||||
|
||||
# _readable_diff <file>
|
||||
# Returns a human-readable diff for <file>.
|
||||
# For pathological single-line formats (JSON, minified JS/CSS…), pretty-prints
|
||||
# both the parent and working versions before diffing so the LLM sees
|
||||
# structured changes rather than one enormous ±line.
|
||||
_readable_diff() {
    # Print on stdout a diff of <file> that is readable enough for an LLM prompt.
    #   $1  path of the changed file.
    # Prints nothing when the file has no diff. When an added/removed line is
    # longer than 500 characters, both the parent (@-) and the working-copy
    # versions are pretty-printed according to the file extension and diffed
    # again, so the change is spread over short lines.
    local file="$1"
    local raw_diff
    # Force git (unified) output: jj's default diff format is not made of
    # lines starting with +/-, so the length probe below would never match it.
    raw_diff=$(jj diff --git -- "$file")
    [[ -z "$raw_diff" ]] && return 0

    # Detect pathological diff: any +/- content line longer than 500 chars
    local max_len
    max_len=$(grep '^[+-]' <<< "$raw_diff" | awk '{ if (length > m) m = length } END { print m+0 }')

    if (( max_len <= 500 )); then
        printf '%s' "$raw_diff"
        return
    fi

    # Pretty-print strategy per extension. A failing formatter leaves the
    # corresponding variable empty, which triggers the raw-diff fallback below.
    local ext="${file##*.}"
    local pretty_old pretty_new
    case "$ext" in
        json)
            pretty_old=$(jj file show -r @- -- "$file" 2>/dev/null | python3 -m json.tool 2>/dev/null || true)
            pretty_new=$(jj file show -- "$file" 2>/dev/null | python3 -m json.tool 2>/dev/null || true)
            ;;
        js|mjs|cjs|css|ts)
            local node_fmt='
                const chunks = [];
                process.stdin.on("data", d => chunks.push(d));
                process.stdin.on("end", () => {
                    const src = chunks.join("");
                    // Insert newline before { } ( ) ; and after ,
                    const out = src
                        .replace(/([{(])/g, "$1\n ")
                        .replace(/([;}])/g, "\n$1\n")
                        .replace(/,\s*/g, ",\n ");
                    process.stdout.write(out);
                });'
            pretty_old=$(jj file show -r @- -- "$file" 2>/dev/null | node -e "$node_fmt" 2>/dev/null || true)
            pretty_new=$(jj file show -- "$file" 2>/dev/null | node -e "$node_fmt" 2>/dev/null || true)
            ;;
        *)
            # Generic fallback: fold long lines at 120 chars
            pretty_old=$(jj file show -r @- -- "$file" 2>/dev/null | fold -s -w 120 || true)
            pretty_new=$(jj file show -- "$file" 2>/dev/null | fold -s -w 120 || true)
            ;;
    esac

    if [[ -n "$pretty_old" && -n "$pretty_new" ]]; then
        # diff exits with 1 when the inputs differ; that is the expected case.
        diff <(printf '%s\n' "$pretty_old") <(printf '%s\n' "$pretty_new") \
            --label "a/${file}" --label "b/${file}" -u || true
    else
        printf '%s' "$raw_diff"
    fi
}
|
||||
|
||||
# Collect changed files in the current working copy change
# (the output of `jj diff --name-only`, read line by line below).
changed_files=$(jj diff --name-only)

# Nothing to describe: report on stderr and stop with a non-zero status.
if [[ -z "$changed_files" ]]; then
    echo "No changed files." >&2
    exit 1
fi

# Line count of the list; `tr` removes any spaces from wc's output.
file_count=$(wc -l <<< "$changed_files" | tr -d ' ')
log "Found $file_count changed file(s)"

# summaries: accumulated "### <file>" sections, one per summarised file.
# n: number of files with a non-empty diff handled so far.
summaries=""
n=0

while IFS= read -r file; do
    diff=$(_readable_diff "$file")
    # Skip files for which no diff text was produced.
    if [[ -z "$diff" ]]; then
        continue
    fi

    n=$((n + 1))
    log "[$n/$file_count] Summarising $file …"

    # First pass: ask aichat for a short summary of this single file's diff.
    summary=$(printf '%s' "$diff" | aichat "In 2-3 lines, summarise what this diff changes in the file '$file'. Be concise and technical.")

    # Print the summary indented to stderr
    while IFS= read -r line; do
        info "$line"
    done <<< "$summary"

    # Append a section for this file, followed by a blank line.
    summaries+="### $file
$summary

"
done <<< "$changed_files"

# Every listed file produced an empty diff: there is nothing to combine.
if [[ -z "$summaries" ]]; then
    echo "No non-empty diffs found." >&2
    exit 1
fi

# Second pass: combine the per-file summaries into one commit message.
log "Generating commit message from $n summary/summaries …"
result=$(printf '%s' "$summaries" | aichat "From these per-file summaries of a jj diff, write a single conventional commit message in English. First line: short imperative summary (max 72 chars). Then a blank line. Then a short paragraph with more detail if needed. Output only the commit message, nothing else.")

ok "Done"
printf '\n' >&2

# Commit message goes to stdout
printf '%s\n' "$result"
|
||||
@@ -1 +0,0 @@
|
||||
Eric Coissac,coissac,mac.lan,20.04.2026 19:13,file:///Users/coissac/Library/Application%20Support/LibreOffice/4;
|
||||
Generated
+8
@@ -1590,6 +1590,9 @@ name = "obikmer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"memmap2",
|
||||
"niffler 3.0.0",
|
||||
"obidebruinj",
|
||||
"obifastwrite",
|
||||
"obikpartitionner",
|
||||
"obikrope",
|
||||
@@ -1597,7 +1600,10 @@ dependencies = [
|
||||
"obipipeline",
|
||||
"obiread",
|
||||
"obiskbuilder",
|
||||
"obiskio",
|
||||
"ph",
|
||||
"pprof",
|
||||
"rayon",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
]
|
||||
@@ -1633,6 +1639,8 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"bitvec",
|
||||
"criterion2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"xxhash-rust",
|
||||
]
|
||||
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
use std::fmt;
|
||||
use std::io::{self, Write};
|
||||
use xxhash_rust::xxh64::xxh64;
|
||||
|
||||
/// A JSON scalar that can appear as a value in a FASTA annotation object.
pub(crate) enum JsonVal<'a> {
    /// Unsigned integer, rendered as a bare JSON number.
    Num(u64),
    /// Text, rendered as a double-quoted and escaped JSON string.
    Str(&'a str),
}

impl fmt::Display for JsonVal<'_> {
    /// Writes the value as JSON text.
    ///
    /// Strings are escaped so that the output stays valid JSON whatever the
    /// content: `"` and `\` are backslash-escaped and control characters
    /// (U+0000–U+001F) use the short or `\uXXXX` forms. Strings made only of
    /// other characters are emitted unchanged between quotes.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            JsonVal::Num(n) => write!(f, "{n}"),
            JsonVal::Str(s) => {
                f.write_str("\"")?;
                for c in s.chars() {
                    match c {
                        '"' => f.write_str("\\\"")?,
                        '\\' => f.write_str("\\\\")?,
                        '\n' => f.write_str("\\n")?,
                        '\r' => f.write_str("\\r")?,
                        '\t' => f.write_str("\\t")?,
                        c if (c as u32) < 0x20 => write!(f, "\\u{:04x}", c as u32)?,
                        c => write!(f, "{c}")?,
                    }
                }
                f.write_str("\"")
            }
        }
    }
}
|
||||
|
||||
pub(crate) fn seq_id(ascii: &[u8]) -> String {
|
||||
format!("{:016X}", xxh64(ascii, 0))
|
||||
}
|
||||
|
||||
pub(crate) fn annotation<W: Write>(
|
||||
writer: &mut W,
|
||||
fields: &[(&str, JsonVal<'_>)],
|
||||
) -> io::Result<()> {
|
||||
write!(writer, "{{")?;
|
||||
for (i, (k, v)) in fields.iter().enumerate() {
|
||||
if i > 0 {
|
||||
write!(writer, ",")?;
|
||||
}
|
||||
write!(writer, "\"{k}\":{v}")?;
|
||||
}
|
||||
write!(writer, "}}")
|
||||
}
|
||||
|
||||
/// Writes `seq` to `writer` wrapped at `width` bytes per line, each line
/// (including the last, possibly shorter one) terminated by `\n`.
/// Nothing is written for an empty sequence.
///
/// The bytes are copied as they are, so no UTF-8 assumption (and no `unsafe`)
/// is needed.
///
/// # Panics
///
/// Panics if `width` is 0 and `seq` is passed to `chunks` (slice chunk size
/// must be non-zero).
pub(crate) fn write_sequence<W: Write>(writer: &mut W, seq: &[u8], width: usize) -> io::Result<()> {
    for line in seq.chunks(width) {
        writer.write_all(line)?;
        writer.write_all(b"\n")?;
    }
    Ok(())
}
|
||||
+15
-24
@@ -30,6 +30,8 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod fasta;
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use obikseq::{kmer::Kmer, superkmer::SuperKmer, unitig::Unitig};
|
||||
@@ -168,8 +170,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_header_contains_minimizer_field() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(2);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 7, Kmer::from_raw(0)));
|
||||
assert!(out.contains("\"minimizer\":\""));
|
||||
assert!(!out.contains("\"count\":"));
|
||||
@@ -178,16 +179,14 @@ mod tests {
|
||||
#[test]
|
||||
fn scatter_minimizer_decoded_from_hash() {
|
||||
// min_hash for "ACG" (A=0,C=1,G=2, m=3): 0*16 + 1*4 + 2 = 6
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 0, Kmer::from_raw_right(6, 3)));
|
||||
assert!(out.contains("\"minimizer\":\"ACG\""), "got: {out}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scatter_fields_present() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 3, 5, Kmer::from_raw(0)));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
@@ -197,8 +196,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn scatter_sequence_line_correct() {
|
||||
let mut sk = make(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "ACGTACGT");
|
||||
@@ -209,7 +207,6 @@ mod tests {
|
||||
#[test]
|
||||
fn count_header_contains_count_field() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.init_count();
|
||||
sk.add(49);
|
||||
let out = capture(|w| write_count(&sk, w, 4, 3, 2));
|
||||
assert!(out.contains("\"count\":50"));
|
||||
@@ -218,8 +215,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_fields_present() {
|
||||
let mut sk = make(b"ACGTACGTACGT");
|
||||
sk.init_count();
|
||||
let sk = make(b"ACGTACGTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 3, 9));
|
||||
assert!(out.contains("\"seq_length\":12"));
|
||||
assert!(out.contains("\"kmer_size\":4"));
|
||||
@@ -230,21 +226,19 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn count_sequence_line_correct() {
|
||||
let mut sk = make(b"TTTTACGT");
|
||||
sk.init_count();
|
||||
// TTTTACGT canonicalises to ACGTAAAA (revcomp is ACGTAAAA < TTTTACGT)
|
||||
let sk = make(b"TTTTACGT");
|
||||
let out = capture(|w| write_count(&sk, w, 4, 2, 0));
|
||||
let lines: Vec<&str> = out.lines().collect();
|
||||
assert_eq!(lines[1], "TTTTACGT");
|
||||
assert_eq!(lines[1], "ACGTAAAA");
|
||||
}
|
||||
|
||||
// ── ID stability ──────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn same_sequence_same_id() {
|
||||
let mut sk1 = make(b"ACGTACGT");
|
||||
sk1.set_minimizer_pos(0);
|
||||
let mut sk2 = make(b"ACGTACGT");
|
||||
sk2.set_minimizer_pos(4); // different pos, same sequence
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"ACGTACGT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
.lines()
|
||||
@@ -267,10 +261,8 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn different_sequences_different_id() {
|
||||
let mut sk1 = make(b"ACGTACGT");
|
||||
sk1.set_minimizer_pos(0);
|
||||
let mut sk2 = make(b"TTTTTTTT");
|
||||
sk2.set_minimizer_pos(0);
|
||||
let sk1 = make(b"ACGTACGT");
|
||||
let sk2 = make(b"TTTTTTTT");
|
||||
|
||||
let id1 = capture(|w| write_scatter(&sk1, w, 4, 2, 0, Kmer::from_raw(0)))
|
||||
.lines()
|
||||
@@ -293,8 +285,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn id_is_16_hex_digits() {
|
||||
let mut sk = make(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
let sk = make(b"ACGTACGT");
|
||||
let out = capture(|w| write_scatter(&sk, w, 4, 2, 0, Kmer::from_raw(0)));
|
||||
let id = &out.lines().next().unwrap()[1..17]; // skip '>'
|
||||
assert_eq!(id.len(), 16);
|
||||
|
||||
@@ -13,9 +13,15 @@ obiread = { path = "../obiread" }
|
||||
obiskbuilder = { path = "../obiskbuilder" }
|
||||
obifastwrite = { path = "../obifastwrite" }
|
||||
obipipeline = { path = "../obipipeline" }
|
||||
obidebruinj = { path = "../obidebruinj" }
|
||||
clap = { version = "4", features = ["derive"] }
|
||||
obikrope = { path = "../obikrope" }
|
||||
obikpartitionner = { path = "../obikpartitionner" }
|
||||
obiskio = { path = "../obiskio" }
|
||||
niffler = "3"
|
||||
rayon = "1"
|
||||
ph = "0.11"
|
||||
memmap2 = "0.9"
|
||||
tracing = "0.1.44"
|
||||
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
|
||||
pprof = { version = "0.13", features = ["prost-codec"], optional = true }
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obikrope::Rope;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
// ── Shared arguments ──────────────────────────────────────────────────────────
|
||||
|
||||
@@ -57,7 +57,7 @@ pub enum PipelineData {
|
||||
Path(PathBuf),
|
||||
RawChunk(Rope),
|
||||
NormChunk(Rope),
|
||||
Batch(Vec<SuperKmer>),
|
||||
Batch(Vec<RoutableSuperKmer>),
|
||||
}
|
||||
|
||||
// SAFETY: Rope contains Cell<u8> which is !Sync, but pipeline ownership transfers
|
||||
|
||||
@@ -0,0 +1,84 @@
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use clap::Args;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obifastwrite::write_count;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obiskio::SKFileReader;
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct FastaArgs {
|
||||
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||
pub partition: PathBuf,
|
||||
|
||||
/// Dump dereplicated super-kmers as FASTA (→ <partition>/dereplicated.skmer.fasta.gz)
|
||||
#[arg(long)]
|
||||
pub super_kmers: bool,
|
||||
}
|
||||
|
||||
pub fn run(args: FastaArgs) {
|
||||
if !args.super_kmers {
|
||||
eprintln!("error: specify at least one output mode (--super-kmers)");
|
||||
std::process::exit(1);
|
||||
}
|
||||
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
if args.super_kmers {
|
||||
dump_super_kmers(&kp, &args.partition);
|
||||
}
|
||||
}
|
||||
|
||||
fn dump_super_kmers(kp: &KmerPartition, partition_dir: &PathBuf) {
|
||||
let k = kp.kmer_size();
|
||||
let m = kp.minimizer_size();
|
||||
let n = kp.n_partitions();
|
||||
|
||||
info!("writing {n} partition FASTA files (parallel)");
|
||||
|
||||
let total = AtomicUsize::new(0);
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = partition_dir.join(format!("part_{i:05}"));
|
||||
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !in_path.exists() {
|
||||
return;
|
||||
}
|
||||
let out_path = part_dir.join("dereplicated.skmer.fasta.gz");
|
||||
|
||||
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", out_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error creating gzip writer: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", in_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut count = 0usize;
|
||||
for sk in reader.iter() {
|
||||
write_count(&sk, &mut writer, k, m, i as u32).unwrap_or_else(|e| {
|
||||
eprintln!("write error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
count += 1;
|
||||
}
|
||||
info!("partition {i}: {count} super-kmers → {}", out_path.display());
|
||||
total.fetch_add(count, Ordering::Relaxed);
|
||||
});
|
||||
|
||||
info!("wrote {} super-kmers total", total.load(Ordering::Relaxed));
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
pub mod count;
|
||||
pub mod fasta;
|
||||
pub mod partition;
|
||||
pub mod superkmer;
|
||||
pub mod unitig;
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cli::{CommonArgs, PipelineData, open_chunks};
|
||||
@@ -39,14 +39,14 @@ pub fn run(args: PartitionArgs) {
|
||||
let path_source = args.common.seqfile_paths();
|
||||
|
||||
let pipe = obipipeline::make_pipe! {
|
||||
PipelineData : PathBuf => Vec<SuperKmer>,
|
||||
PipelineData : PathBuf => Vec<RoutableSuperKmer>,
|
||||
||? { |path| open_chunks(path) } : Path => RawChunk,
|
||||
|? { move |rope| obiread::normalize_sequence_chunk(rope, k) } : RawChunk => NormChunk,
|
||||
| { move |rope| obiskbuilder::build_superkmers(rope, k, m, level_max, theta) }: NormChunk => Batch,
|
||||
};
|
||||
|
||||
for mut batch in pipe.apply(path_source, n_workers, 1) {
|
||||
kp.write_batch(&mut batch).unwrap_or_else(|e| {
|
||||
for batch in pipe.apply(path_source, n_workers, 1) {
|
||||
kp.write_batch(batch).unwrap_or_else(|e| {
|
||||
eprintln!("error: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::path::PathBuf;
|
||||
|
||||
use clap::Args;
|
||||
use obifastwrite::write_scatter;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
use crate::cli::{CommonArgs, PipelineData, open_chunks};
|
||||
|
||||
@@ -16,20 +16,17 @@ pub struct SuperkmerArgs {
|
||||
// ── Stage functions ───────────────────────────────────────────────────────────
|
||||
|
||||
fn write_batch(
|
||||
batch: Vec<SuperKmer>,
|
||||
batch: Vec<RoutableSuperKmer>,
|
||||
out: &mut BufWriter<io::Stdout>,
|
||||
partition_bits: usize,
|
||||
k: usize,
|
||||
m: usize,
|
||||
) -> io::Result<()> {
|
||||
let partition_mask = (1u64 << partition_bits) - 1;
|
||||
for sk in batch {
|
||||
let minimizer = sk
|
||||
.kmer(sk.minimizer_pos() as usize, m)
|
||||
.map_err(io::Error::other)?
|
||||
.canonical(m);
|
||||
for rsk in batch {
|
||||
let minimizer = *rsk.minimizer();
|
||||
let partition = (minimizer.hash(m) & partition_mask) as usize;
|
||||
write_scatter(&sk, out, k, m, partition, minimizer)?;
|
||||
write_scatter(rsk.superkmer(), out, k, m, partition, minimizer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -47,7 +44,7 @@ pub fn run(args: SuperkmerArgs) {
|
||||
let path_source = args.common.seqfile_paths();
|
||||
|
||||
let pipe = obipipeline::make_pipe! {
|
||||
PipelineData : PathBuf => Vec<SuperKmer>,
|
||||
PipelineData : PathBuf => Vec<RoutableSuperKmer>,
|
||||
||? { |path| open_chunks(path) } : Path => RawChunk,
|
||||
|? { move |rope| obiread::normalize_sequence_chunk(rope, k) } : RawChunk => NormChunk,
|
||||
| { move |rope| obiskbuilder::build_superkmers(rope, k, m, level_max, theta) }: NormChunk => Batch,
|
||||
|
||||
@@ -0,0 +1,138 @@
|
||||
use std::fs::File;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use clap::Args;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obidebruinj::GraphDeBruijn;
|
||||
use obikpartitionner::KmerPartition;
|
||||
use obiskio::SKFileReader;
|
||||
use ph::fmph::GOFunction;
|
||||
use rayon::prelude::*;
|
||||
use tracing::info;
|
||||
|
||||
#[derive(Args)]
|
||||
pub struct UnitigArgs {
|
||||
/// Root of the k-mer partition directory (produced by the `partition` command)
|
||||
pub partition: PathBuf,
|
||||
|
||||
/// Minimum kmer abundance (inclusive); kmers below this threshold are excluded
|
||||
#[arg(long, default_value_t = 1)]
|
||||
pub min_abundance: u32,
|
||||
|
||||
/// Maximum kmer abundance (inclusive); kmers above this threshold are excluded
|
||||
#[arg(long)]
|
||||
pub max_abundance: Option<u32>,
|
||||
}
|
||||
|
||||
pub fn run(args: UnitigArgs) {
|
||||
let kp = KmerPartition::open(&args.partition).unwrap_or_else(|e| {
|
||||
eprintln!("error opening partition: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
|
||||
let k = kp.kmer_size();
|
||||
let n = kp.n_partitions();
|
||||
info!("building unitigs from {n} partitions (k={k}, parallel)");
|
||||
|
||||
let total_kmers = AtomicUsize::new(0);
|
||||
|
||||
(0..n).into_par_iter().for_each(|i| {
|
||||
let part_dir = args.partition.join(format!("part_{i:05}"));
|
||||
let in_path = part_dir.join("dereplicated.skmer.zst");
|
||||
if !in_path.exists() {
|
||||
return;
|
||||
}
|
||||
let out_path = part_dir.join("unitig.fasta.gz");
|
||||
|
||||
let mut g = GraphDeBruijn::new(k);
|
||||
|
||||
let mphf_path = part_dir.join("mphf1.bin");
|
||||
let counts_path = part_dir.join("counts1.bin");
|
||||
let filter_active = (args.min_abundance > 1 || args.max_abundance.is_some())
|
||||
&& mphf_path.exists()
|
||||
&& counts_path.exists();
|
||||
|
||||
let mphf_opt: Option<GOFunction> = if filter_active {
|
||||
let mut f = File::open(&mphf_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", mphf_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(GOFunction::read(&mut f).unwrap_or_else(|e| {
|
||||
eprintln!("error reading MPHF {}: {e}", mphf_path.display());
|
||||
std::process::exit(1)
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_mmap_opt = if filter_active {
|
||||
let cf = File::open(&counts_path).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", counts_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
Some(unsafe {
|
||||
memmap2::Mmap::map(&cf).unwrap_or_else(|e| {
|
||||
eprintln!("error mmapping {}: {e}", counts_path.display());
|
||||
std::process::exit(1)
|
||||
})
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let counts_slice: Option<&[u32]> = counts_mmap_opt.as_ref().map(|m| unsafe {
|
||||
std::slice::from_raw_parts(m.as_ptr() as *const u32, m.len() / 4)
|
||||
});
|
||||
|
||||
let mut reader = SKFileReader::open(&in_path, k).unwrap_or_else(|e| {
|
||||
eprintln!("error opening {}: {e}", in_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
for sk in reader.iter() {
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
let accept = match (&mphf_opt, counts_slice) {
|
||||
(Some(mphf), Some(counts)) => {
|
||||
if let Some(slot) = mphf.get(&kmer) {
|
||||
let ab = counts[slot as usize];
|
||||
ab >= args.min_abundance
|
||||
&& args.max_abundance.map_or(true, |max| ab <= max)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
_ => true,
|
||||
};
|
||||
if accept {
|
||||
g.push(kmer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let n_kmers = g.len();
|
||||
total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
|
||||
info!("partition {i}/{n}: {n_kmers} canonical k-mers → {}", out_path.display());
|
||||
|
||||
g.compute_degrees();
|
||||
|
||||
let file = File::create(&out_path).unwrap_or_else(|e| {
|
||||
eprintln!("error creating {}: {e}", out_path.display());
|
||||
std::process::exit(1)
|
||||
});
|
||||
let mut writer = niffler::send::get_writer(Box::new(file), Format::Gzip, Level::Six)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("error creating gzip writer: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
g.write_fasta(&mut writer).unwrap_or_else(|e| {
|
||||
eprintln!("write error on partition {i}: {e}");
|
||||
std::process::exit(1)
|
||||
});
|
||||
});
|
||||
|
||||
info!(
|
||||
"done — {} total canonical k-mers across all partitions",
|
||||
total_kmers.load(Ordering::Relaxed)
|
||||
);
|
||||
}
|
||||
@@ -19,6 +19,10 @@ enum Commands {
|
||||
Partition(cmd::partition::PartitionArgs),
|
||||
/// Count kmers from an existing dereplicated partition directory
|
||||
Count(cmd::count::CountArgs),
|
||||
/// Export partition data to FASTA (--super-kmers: dereplicated super-kmers)
|
||||
Fasta(cmd::fasta::FastaArgs),
|
||||
/// Build de Bruijn unitigs for all partitions and write to unitig.fasta.gz
|
||||
Unitig(cmd::unitig::UnitigArgs),
|
||||
}
|
||||
|
||||
fn main() {
|
||||
@@ -41,6 +45,8 @@ fn main() {
|
||||
Commands::Superkmer(args) => cmd::superkmer::run(args),
|
||||
Commands::Partition(args) => cmd::partition::run(args),
|
||||
Commands::Count(args) => cmd::count::run(args),
|
||||
Commands::Fasta(args) => cmd::fasta::run(args),
|
||||
Commands::Unitig(args) => cmd::unitig::run(args),
|
||||
}
|
||||
|
||||
#[cfg(feature = "profiling")]
|
||||
|
||||
@@ -15,6 +15,7 @@ use remove_dir_all::remove_dir_all;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
use obiskio::{SKFileMeta, SKFileReader, SKFileWriter, SKResult};
|
||||
use rayon::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -102,8 +103,8 @@ impl KmerPartition {
|
||||
.into());
|
||||
}
|
||||
let meta_path = root_path.join(META_FILENAME);
|
||||
let meta: PartitionMeta = serde_json::from_reader(fs::File::open(&meta_path)?)
|
||||
.map_err(io::Error::other)?;
|
||||
let meta: PartitionMeta =
|
||||
serde_json::from_reader(fs::File::open(&meta_path)?).map_err(io::Error::other)?;
|
||||
|
||||
let level = level_from_u32(meta.level);
|
||||
let n_partitions = 1usize << meta.n_bits;
|
||||
@@ -120,19 +121,21 @@ impl KmerPartition {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn write(&mut self, sk: &mut SuperKmer) -> SKResult<()> {
|
||||
/// Route and write one super-kmer to its partition file.
|
||||
pub fn write(&mut self, rsk: RoutableSuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
let partition = self.partition_of(sk)?;
|
||||
sk.init_count();
|
||||
self.ensure_writer(partition)?.write(sk)
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)
|
||||
}
|
||||
|
||||
pub fn write_batch(&mut self, sks: &mut [SuperKmer]) -> SKResult<()> {
|
||||
/// Route and write a batch of super-kmers.
|
||||
pub fn write_batch(&mut self, rsks: Vec<RoutableSuperKmer>) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for sk in sks {
|
||||
let partition = self.partition_of(sk)?;
|
||||
sk.init_count();
|
||||
self.ensure_writer(partition)?.write(sk)?;
|
||||
for rsk in rsks {
|
||||
let partition = (rsk.minimizer().hash(self.minimizer_size) & self.partitions_mask) as usize;
|
||||
let sk = rsk.into_superkmer();
|
||||
self.ensure_writer(partition)?.write(&sk)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -164,6 +167,18 @@ impl KmerPartition {
|
||||
&self.root_path
|
||||
}
|
||||
|
||||
pub fn kmer_size(&self) -> usize {
|
||||
self.kmer_size
|
||||
}
|
||||
|
||||
pub fn minimizer_size(&self) -> usize {
|
||||
self.minimizer_size
|
||||
}
|
||||
|
||||
pub fn n_partitions(&self) -> usize {
|
||||
self.n_partitions
|
||||
}
|
||||
|
||||
/// Deduplicate all `raw.{ext}` files in parallel, replacing each with a
|
||||
/// `dereplicated.{ext}` file where identical canonical sequences are merged
|
||||
/// and their counts summed.
|
||||
@@ -185,6 +200,7 @@ impl KmerPartition {
|
||||
/// more temporary file descriptors — all managed by the global fd pool.
|
||||
pub fn dereplicate(&self) -> SKResult<()> {
|
||||
let level = self.level;
|
||||
let k = self.kmer_size;
|
||||
let root = &self.root_path;
|
||||
let sys = System::new_all();
|
||||
// available_memory() can return 0 on macOS when the compressor page count exceeds
|
||||
@@ -205,7 +221,7 @@ impl KmerPartition {
|
||||
}
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let n_buckets = optimal_buckets(&raw_path, available_per_thread);
|
||||
dereplicate_partition(&dir, level, n_buckets)
|
||||
dereplicate_partition(&dir, level, n_buckets, k)
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -270,8 +286,10 @@ impl KmerPartition {
|
||||
}
|
||||
}
|
||||
|
||||
let global_spectrum_map: BTreeMap<String, u64> =
|
||||
global_spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect();
|
||||
let global_spectrum_map: BTreeMap<String, u64> = global_spectrum
|
||||
.iter()
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(root.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": global_f0, "f1": global_f1, "spectrum": &global_spectrum_map }),
|
||||
@@ -291,14 +309,6 @@ impl KmerPartition {
|
||||
}
|
||||
}
|
||||
|
||||
fn partition_of(&self, sk: &SuperKmer) -> SKResult<usize> {
|
||||
let minimizer = sk
|
||||
.kmer(sk.minimizer_pos() as usize, self.minimizer_size)
|
||||
.map_err(|e| io::Error::other(e))?
|
||||
.canonical(self.minimizer_size);
|
||||
Ok((minimizer.hash(self.minimizer_size) & self.partitions_mask) as usize)
|
||||
}
|
||||
|
||||
fn write_meta(&self, n_bits: usize) -> SKResult<()> {
|
||||
let meta = PartitionMeta {
|
||||
n_bits,
|
||||
@@ -316,7 +326,8 @@ impl KmerPartition {
|
||||
let dir = self.root_path.join(format!("part_{:05}", partition));
|
||||
fs::create_dir_all(&dir)?;
|
||||
let file_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
let writer = SKFileWriter::create_with(file_path, Format::Zstd, self.level)?;
|
||||
let writer =
|
||||
SKFileWriter::create_with(file_path, self.kmer_size, Format::Zstd, self.level)?;
|
||||
self.writers[partition] = Some(writer);
|
||||
}
|
||||
Ok(self.writers[partition].as_mut().unwrap())
|
||||
@@ -373,33 +384,47 @@ fn optimal_buckets(raw_path: &Path, available_bytes: u64) -> usize {
|
||||
|
||||
fn level_from_u32(n: u32) -> Level {
|
||||
match n {
|
||||
0 => Level::Zero, 1 => Level::One, 2 => Level::Two, 3 => Level::Three,
|
||||
4 => Level::Four, 5 => Level::Five, 6 => Level::Six, 7 => Level::Seven,
|
||||
8 => Level::Eight, 9 => Level::Nine, 10 => Level::Ten, 11 => Level::Eleven,
|
||||
12 => Level::Twelve, 13 => Level::Thirteen, 14 => Level::Fourteen,
|
||||
15 => Level::Fifteen, 16 => Level::Sixteen, 17 => Level::Seventeen,
|
||||
18 => Level::Eighteen, 19 => Level::Nineteen, 20 => Level::Twenty,
|
||||
0 => Level::Zero,
|
||||
1 => Level::One,
|
||||
2 => Level::Two,
|
||||
3 => Level::Three,
|
||||
4 => Level::Four,
|
||||
5 => Level::Five,
|
||||
6 => Level::Six,
|
||||
7 => Level::Seven,
|
||||
8 => Level::Eight,
|
||||
9 => Level::Nine,
|
||||
10 => Level::Ten,
|
||||
11 => Level::Eleven,
|
||||
12 => Level::Twelve,
|
||||
13 => Level::Thirteen,
|
||||
14 => Level::Fourteen,
|
||||
15 => Level::Fifteen,
|
||||
16 => Level::Sixteen,
|
||||
17 => Level::Seventeen,
|
||||
18 => Level::Eighteen,
|
||||
19 => Level::Nineteen,
|
||||
20 => Level::Twenty,
|
||||
_ => Level::TwentyOne,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Maximum value that fits in the 24-bit COUNT field of a SuperKmer header.
|
||||
const MAX_SK_COUNT: u64 = (1 << 24) - 1;
|
||||
|
||||
/// Deduplicate one partition directory in place (two-phase split + merge).
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()> {
|
||||
fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize, k: usize) -> SKResult<()> {
|
||||
let raw_path = dir.join(format!("raw.{SK_EXT}"));
|
||||
if !raw_path.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let out_path = dir.join(format!("dereplicated.{SK_EXT}"));
|
||||
let mut writer = SKFileWriter::create_with(&out_path, Format::Zstd, level)?;
|
||||
let mut writer = SKFileWriter::create_with(&out_path, k, Format::Zstd, level)?;
|
||||
|
||||
if n_temp == 1 {
|
||||
// ── Direct path: partition fits in memory, no split needed ────────────
|
||||
let map = load_bucket(&raw_path)?;
|
||||
let map = load_bucket(&raw_path, k)?;
|
||||
remove_skmer_file(&raw_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
} else {
|
||||
@@ -412,10 +437,10 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
{
|
||||
let mut writers: Vec<SKFileWriter> = temp_paths
|
||||
.iter()
|
||||
.map(|p| SKFileWriter::create_with(p, Format::Zstd, level))
|
||||
.map(|p| SKFileWriter::create_with(p, k, Format::Zstd, level))
|
||||
.collect::<SKResult<_>>()?;
|
||||
|
||||
let mut reader = SKFileReader::open(&raw_path)?;
|
||||
let mut reader = SKFileReader::open(&raw_path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
let bucket = (sk.hash() & temp_mask) as usize;
|
||||
@@ -429,7 +454,7 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
|
||||
// ── Phase 2: merge each temp bucket into the output ───────────────────
|
||||
for temp_path in &temp_paths {
|
||||
let map = load_bucket(temp_path)?;
|
||||
let map = load_bucket(temp_path, k)?;
|
||||
remove_skmer_file(temp_path)?;
|
||||
flush_map(map, &mut writer)?;
|
||||
}
|
||||
@@ -440,14 +465,14 @@ fn dereplicate_partition(dir: &Path, level: Level, n_temp: usize) -> SKResult<()
|
||||
}
|
||||
|
||||
/// Read a SuperKmer file into a deduplication map (already canonical).
|
||||
fn load_bucket(path: &Path) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
fn load_bucket(path: &Path, k: usize) -> SKResult<HashMap<SuperKmer, u64>> {
|
||||
let capacity = SKFileMeta::read(path)
|
||||
.ok()
|
||||
.flatten()
|
||||
.map(|m| m.instances as usize)
|
||||
.unwrap_or(0);
|
||||
let mut map: HashMap<SuperKmer, u64> = HashMap::with_capacity(capacity);
|
||||
let mut reader = SKFileReader::open(path)?;
|
||||
let mut reader = SKFileReader::open(path, k)?;
|
||||
while let Some(mut sk) = reader.read()? {
|
||||
sk.canonical();
|
||||
let count = sk.count() as u64;
|
||||
@@ -487,7 +512,7 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let mut seen: HashSet<Kmer> = HashSet::with_capacity(capacity);
|
||||
let mut pass1_superkmers: u64 = 0;
|
||||
{
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass1_superkmers += 1;
|
||||
for kmer in sk.iter_canonical_kmers(k) {
|
||||
@@ -497,7 +522,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
}
|
||||
let kmers: Vec<Kmer> = seen.into_iter().collect();
|
||||
let n_kmers = kmers.len();
|
||||
debug!("{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}", dir.display());
|
||||
debug!(
|
||||
"{}: pass1 superkmers={pass1_superkmers} unique_kmers={n_kmers}",
|
||||
dir.display()
|
||||
);
|
||||
|
||||
if n_kmers == 0 {
|
||||
return Ok(());
|
||||
@@ -527,13 +555,16 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
{
|
||||
let counts =
|
||||
unsafe { std::slice::from_raw_parts_mut(mmap.as_mut_ptr() as *mut u32, n_kmers) };
|
||||
let mut reader = SKFileReader::open(dedup_path)?;
|
||||
let mut reader = SKFileReader::open(dedup_path, k)?;
|
||||
while let Some(sk) = reader.read()? {
|
||||
pass2_superkmers += 1;
|
||||
let seql = sk.seql();
|
||||
let seql = sk.len();
|
||||
let sk_count = sk.count();
|
||||
if pass2_superkmers <= 3 {
|
||||
debug!("{}: sk#{pass2_superkmers} seql={seql} count={sk_count}", dir.display());
|
||||
debug!(
|
||||
"{}: sk#{pass2_superkmers} seql={seql} count={sk_count}",
|
||||
dir.display()
|
||||
);
|
||||
}
|
||||
if seql < k {
|
||||
continue;
|
||||
@@ -566,8 +597,10 @@ fn count_partition(dir: &Path, dedup_path: &Path, k: usize) -> SKResult<()> {
|
||||
let f0 = n_kmers as u64;
|
||||
let f1: u64 = spectrum.iter().map(|(&c, &f)| c as u64 * f).sum();
|
||||
|
||||
let spectrum_map: BTreeMap<String, u64> =
|
||||
spectrum.iter().map(|(&c, &f)| (format!("{c:010}"), f)).collect();
|
||||
let spectrum_map: BTreeMap<String, u64> = spectrum
|
||||
.iter()
|
||||
.map(|(&c, &f)| (format!("{c:010}"), f))
|
||||
.collect();
|
||||
serde_json::to_writer_pretty(
|
||||
fs::File::create(dir.join("kmer_spectrum_raw.json"))?,
|
||||
&serde_json::json!({ "f0": f0, "f1": f1, "spectrum": &spectrum_map }),
|
||||
|
||||
@@ -5,6 +5,8 @@ edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
bitvec = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0.149"
|
||||
xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -40,7 +40,7 @@ fn bench_write_ascii(c: &mut Criterion) {
|
||||
let mut buf = Vec::with_capacity(len);
|
||||
b.iter(|| {
|
||||
buf.clear();
|
||||
std::hint::black_box(sk).write_ascii(&mut buf);
|
||||
std::hint::black_box(sk).write_ascii(&mut buf).unwrap();
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
use serde::Serialize;
|
||||
use serde_json;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// Serialize `self` as a single-line JSON object into a writer.
|
||||
pub trait Annotation: Serialize {
|
||||
/// Write the annotation as compact JSON into `writer`.
|
||||
fn write<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let s = serde_json::to_string(self).map_err(io::Error::other)?;
|
||||
writer.write_all(s.as_bytes())
|
||||
}
|
||||
}
|
||||
@@ -4,6 +4,8 @@
|
||||
//! The low 64−2k bits are always zero. k is not stored — it is a parameter of
|
||||
//! every operation that needs it, and will be owned by the collection-level indexer.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
|
||||
// ── KmerError ─────────────────────────────────────────────────────────────────
|
||||
@@ -115,24 +117,24 @@ impl Kmer {
|
||||
#[inline]
|
||||
pub fn to_ascii(&self, k: usize) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(k);
|
||||
self.write_ascii(k, &mut buf);
|
||||
self.write_ascii(k, &mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
/// Decode this kmer into ASCII nucleotides, appending into `buf`.
|
||||
/// Zero allocation — caller owns the buffer.
|
||||
/// Decode this kmer into ASCII nucleotides, writing into `writer`.
|
||||
#[inline]
|
||||
pub fn write_ascii(&self, k: usize, buf: &mut Vec<u8>) {
|
||||
pub fn write_ascii<W: Write>(&self, k: usize, writer: &mut W) -> io::Result<()> {
|
||||
let bytes = self.0.to_be_bytes();
|
||||
let full = k / 4;
|
||||
let rem = k % 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[bytes[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[bytes[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
if rem > 0 {
|
||||
let decoded = DEC4[bytes[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&decoded[..rem]);
|
||||
writer.write_all(&decoded[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute the reverse complement of this kmer.
|
||||
|
||||
@@ -5,8 +5,17 @@
|
||||
|
||||
#![deny(missing_docs)]
|
||||
|
||||
mod annotations;
|
||||
|
||||
mod encoding;
|
||||
pub mod kmer;
|
||||
mod revcomp_lookup;
|
||||
/// Routable super-kmer: canonical sequence paired with its minimizer for scatter routing.
|
||||
pub mod routable;
|
||||
pub mod superkmer;
|
||||
|
||||
pub mod unitig;
|
||||
|
||||
pub use annotations::Annotation;
|
||||
pub use routable::RoutableSuperKmer;
|
||||
pub use superkmer::SuperKmer;
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
//! Super-kmer with routing metadata: canonical sequence + pre-computed minimizer.
|
||||
|
||||
use super::kmer::Kmer;
|
||||
use super::SuperKmer;
|
||||
|
||||
/// Owned wrapper that pairs a canonical [`SuperKmer`] with its minimizer [`Kmer`].
|
||||
///
|
||||
/// Created at the single point where raw sequence bytes are emitted from the
|
||||
/// scratch buffer. The minimizer position (given in original orientation) is
|
||||
/// adjusted for any flip applied during canonicalisation. After routing, call
|
||||
/// [`into_superkmer`] to discard the metadata and continue with the bare sequence.
|
||||
///
|
||||
/// [`into_superkmer`]: RoutableSuperKmer::into_superkmer
|
||||
pub struct RoutableSuperKmer {
|
||||
superkmer: SuperKmer,
|
||||
minimizer: Kmer,
|
||||
}
|
||||
|
||||
impl RoutableSuperKmer {
|
||||
/// Construct from raw packed bytes.
|
||||
///
|
||||
/// `min_pos` is the 0-based minimizer position in the **original** (pre-flip)
|
||||
/// orientation. `m` is the minimizer length. `seql` and `seq` are the
|
||||
/// raw length byte and 2-bit-packed nucleotides as produced by the scratch
|
||||
/// buffer.
|
||||
pub fn build(min_pos: usize, m: usize, seql: u8, seq: Box<[u8]>) -> Self {
|
||||
let (sk, already_canonical) = SuperKmer::build(seql, seq);
|
||||
let adjusted_pos = if already_canonical {
|
||||
min_pos
|
||||
} else {
|
||||
sk.len() - m - min_pos
|
||||
};
|
||||
let minimizer = sk.kmer(adjusted_pos, m).unwrap().canonical(m);
|
||||
Self {
|
||||
superkmer: sk,
|
||||
minimizer,
|
||||
}
|
||||
}
|
||||
|
||||
/// Borrow the canonical super-kmer sequence.
|
||||
pub fn superkmer(&self) -> &SuperKmer {
|
||||
&self.superkmer
|
||||
}
|
||||
|
||||
/// Borrow the canonical minimizer kmer.
|
||||
pub fn minimizer(&self) -> &Kmer {
|
||||
&self.minimizer
|
||||
}
|
||||
|
||||
/// Consume this wrapper and return the inner [`SuperKmer`].
|
||||
pub fn into_superkmer(self) -> SuperKmer {
|
||||
self.superkmer
|
||||
}
|
||||
|
||||
/// Sequence length in nucleotides.
|
||||
pub fn len(&self) -> usize {
|
||||
self.superkmer.len()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,5 @@
|
||||
/// Minimal read-only interface shared by sequence types.
pub trait Sequence {
    /// Length of the sequence, in the unit chosen by the implementor.
    fn len(&self) -> usize;

    /// `true` when [`len`](Sequence::len) is zero.
    ///
    /// Provided so that implementors do not have to write it themselves.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Borrow the bytes backing the sequence; their encoding is defined by
    /// the implementor.
    fn sequence(&self) -> &[u8];

    /// Return the reverse complement as a new value of the same type.
    fn revcomp(&self) -> Self;
}
|
||||
+54
-587
@@ -1,4 +1,7 @@
|
||||
//! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
|
||||
use std::io::{self, Write};
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
@@ -14,70 +17,24 @@ use xxhash_rust::xxh3::xxh3_64;
|
||||
///
|
||||
/// ```text
|
||||
/// [31 .......... 8] [7 ...... 0]
|
||||
/// payload (24 b) SEQL (8 b)
|
||||
/// count (24 b) SEQL (8 b)
|
||||
/// ```
|
||||
///
|
||||
/// SEQL encodes the sequence length: 1–255 map directly; 0 encodes 256.
|
||||
///
|
||||
/// # Temporal dual-use of the payload field
|
||||
///
|
||||
/// The 24-bit payload field serves two distinct roles that are **never active
|
||||
/// at the same time**, separated by the routing step of the scatter pipeline:
|
||||
///
|
||||
/// | Phase | Bits [15:8] | Bits [31:16] |
|
||||
/// |---|---|---|
|
||||
/// | **Scatter** (before routing) | minimizer start position (0–255) | unused (zero) |
|
||||
/// | **Count** (after routing) | low byte of occurrence count | high bytes of occurrence count |
|
||||
///
|
||||
/// During scatter, [`set_minimizer_pos`] stores the 0-based position of the
|
||||
/// minimizer's first nucleotide within the super-kmer. At routing time,
|
||||
/// [`init_count`] overwrites the entire payload with `1`, marking the
|
||||
/// super-kmer as seen once and enabling the usual [`increment`] / [`add`] /
|
||||
/// [`set_count`] operations during deduplication.
|
||||
///
|
||||
/// [`set_minimizer_pos`]: SuperKmerHeader::set_minimizer_pos
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
/// [`increment`]: SuperKmerHeader::increment
|
||||
/// [`add`]: SuperKmerHeader::add
|
||||
/// [`set_count`]: SuperKmerHeader::set_count
|
||||
/// The count field starts at 1 and accumulates occurrence counts during
|
||||
/// deduplication.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub(crate) struct SuperKmerHeader(u32);
|
||||
|
||||
impl SuperKmerHeader {
|
||||
pub(crate) fn new(seql: u8) -> Self {
|
||||
Self(seql as u32)
|
||||
Self((1 << 8) | seql as u32)
|
||||
}
|
||||
|
||||
fn seql(&self) -> u8 {
|
||||
self.0 as u8
|
||||
}
|
||||
|
||||
// ── scatter phase ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Store the minimizer start position (bits [15:8]).
|
||||
/// Only meaningful during the scatter phase, before [`init_count`].
|
||||
///
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
fn set_minimizer_pos(&mut self, pos: u8) {
|
||||
self.0 = (self.0 & 0xFF) | ((pos as u32) << 8);
|
||||
}
|
||||
|
||||
/// Return the minimizer start position stored during scatter.
|
||||
/// Only meaningful before [`init_count`] is called.
|
||||
///
|
||||
/// [`init_count`]: SuperKmerHeader::init_count
|
||||
fn minimizer_pos(&self) -> u8 {
|
||||
(self.0 >> 8) as u8
|
||||
}
|
||||
|
||||
// ── count phase ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Transition from scatter to count phase: set occurrence count to 1.
|
||||
/// Overwrites the minimizer position stored in the payload.
|
||||
fn init_count(&mut self) {
|
||||
self.0 = (self.0 & 0xFF) | (1 << 8);
|
||||
}
|
||||
|
||||
fn count(&self) -> u32 {
|
||||
self.0 >> 8
|
||||
}
|
||||
@@ -95,6 +52,15 @@ impl SuperKmerHeader {
|
||||
}
|
||||
}
|
||||
|
||||
// Serialisable annotation record (derives serde's `Serialize`).
// With the derive's defaults the field names below become the serialised
// keys and are emitted in declaration order, so renaming or reordering a
// field changes the output.
// NOTE(review): the code that builds and writes this record is outside this
// chunk; the per-field notes below are inferred from the names — confirm.
#[derive(Serialize)]
|
||||
struct CountAnnotation {
|
||||
// presumably the sequence length in nucleotides — confirm
seq_length: usize,
|
||||
// presumably the k-mer size k used for the run — confirm
kmer_size: usize,
|
||||
// presumably the minimizer size m used for the run — confirm
minimizer_size: usize,
|
||||
// presumably the index of the partition the record belongs to — confirm
partition: u32,
|
||||
// presumably the occurrence count of the annotated sequence — confirm
count: u32,
|
||||
}
|
||||
|
||||
// ── SuperKmer ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Canonical super-kmer: 32-bit header followed by a byte-aligned 2-bit nucleotide sequence.
|
||||
@@ -127,12 +93,18 @@ impl std::hash::Hash for SuperKmer {
|
||||
impl SuperKmer {
|
||||
/// `seql` is the raw stored byte: 1–255 for lengths 1–255, 0 for length 256.
|
||||
pub fn new(seql: u8, seq: Box<[u8]>) -> Self {
|
||||
let len = stored_to_len(seql);
|
||||
debug_assert_eq!(seq.len(), byte_len(len));
|
||||
Self {
|
||||
Self::build(seql, seq).0
|
||||
}
|
||||
|
||||
/// Construct and canonicalise in place, returning `(sk, already_canonical)`.
|
||||
/// `already_canonical` is `true` when the sequence was not flipped.
|
||||
pub fn build(seql: u8, seq: Box<[u8]>) -> (Self, bool) {
|
||||
let mut sk = Self {
|
||||
header: SuperKmerHeader::new(seql),
|
||||
seq,
|
||||
}
|
||||
};
|
||||
let already_canonical = sk.canonical(); // true = pas retourné
|
||||
(sk, already_canonical)
|
||||
}
|
||||
|
||||
/// Deserialise from a raw 32-bit header word and packed sequence bytes.
|
||||
@@ -141,14 +113,19 @@ impl SuperKmer {
|
||||
let seql = (bits & 0xFF) as u8;
|
||||
let len = stored_to_len(seql);
|
||||
debug_assert_eq!(seq.len(), byte_len(len));
|
||||
Self {
|
||||
let sk = Self {
|
||||
header: SuperKmerHeader(bits),
|
||||
seq,
|
||||
}
|
||||
};
|
||||
debug_assert!(
|
||||
sk.is_canonical(),
|
||||
"SuperKmer deserialised from disk is not canonical"
|
||||
);
|
||||
sk
|
||||
}
|
||||
|
||||
/// Returns the sequence length in nucleotides (1–256).
|
||||
pub fn seql(&self) -> usize {
|
||||
pub fn len(&self) -> usize {
|
||||
stored_to_len(self.header.seql())
|
||||
}
|
||||
|
||||
@@ -172,44 +149,6 @@ impl SuperKmer {
|
||||
self.header.set_count(n);
|
||||
}
|
||||
|
||||
// ── scatter / routing interface ───────────────────────────────────────────
|
||||
|
||||
/// Store the 0-based position of the minimizer's first nucleotide within
|
||||
/// this super-kmer.
|
||||
///
|
||||
/// **Scatter phase only.** Must be called before [`init_count`].
|
||||
/// The position is encoded in the payload field that later holds the
|
||||
/// occurrence count; the two uses are mutually exclusive by pipeline phase.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn set_minimizer_pos(&mut self, pos: u8) {
|
||||
self.header.set_minimizer_pos(pos);
|
||||
}
|
||||
|
||||
/// Return the stored minimizer start position.
|
||||
///
|
||||
/// **Scatter phase only.** Only meaningful before [`init_count`] is called.
|
||||
///
|
||||
/// [`init_count`]: SuperKmer::init_count
|
||||
pub fn minimizer_pos(&self) -> u8 {
|
||||
self.header.minimizer_pos()
|
||||
}
|
||||
|
||||
/// Transition from scatter phase to count phase: set occurrence count to 1.
|
||||
///
|
||||
/// Call this once at routing time. After this call, [`minimizer_pos`] is
|
||||
/// no longer valid and the count methods ([`count`], [`increment`], [`add`],
|
||||
/// [`set_count`]) become meaningful.
|
||||
///
|
||||
/// [`minimizer_pos`]: SuperKmer::minimizer_pos
|
||||
/// [`count`]: SuperKmer::count
|
||||
/// [`increment`]: SuperKmer::increment
|
||||
/// [`add`]: SuperKmer::add
|
||||
/// [`set_count`]: SuperKmer::set_count
|
||||
pub fn init_count(&mut self) {
|
||||
self.header.init_count();
|
||||
}
|
||||
|
||||
/// Extract nucleotide i (0-based from 5' end) as a 2-bit value.
|
||||
pub fn nucleotide(&self, i: usize) -> u8 {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
@@ -217,7 +156,7 @@ impl SuperKmer {
|
||||
|
||||
/// Reverse-complement this super-kmer in place.
|
||||
pub fn revcomp(&mut self) {
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
let n = byte_len(seql);
|
||||
|
||||
// Step 1: swap bytes outside-in, applying revcomp4 to each.
|
||||
@@ -245,8 +184,7 @@ impl SuperKmer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a new SuperKmer.
|
||||
/// The result is not yet in canonical form; call `.canonical()` if needed.
|
||||
/// Encode an ASCII nucleotide sequence (ACGT, length 1–256) into a canonical SuperKmer.
|
||||
pub fn from_ascii(ascii: &[u8]) -> Self {
|
||||
let seql = ascii.len();
|
||||
debug_assert!(
|
||||
@@ -275,25 +213,26 @@ impl SuperKmer {
|
||||
Self::new(seql as u8, seq.into_boxed_slice()) // 256usize as u8 == 0, intentional
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
let seql = self.seql();
|
||||
/// Decode this super-kmer sequence into ASCII nucleotides, writing into `writer`.
|
||||
pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let seql = self.len();
|
||||
let full = seql / 4;
|
||||
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
let rem = seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
writer.write_all(&bytes[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Decode this super-kmer sequence into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql());
|
||||
self.write_ascii(&mut buf);
|
||||
let mut buf = Vec::with_capacity(self.len());
|
||||
self.write_ascii(&mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
@@ -318,7 +257,7 @@ impl SuperKmer {
|
||||
if k == 0 || k > 32 {
|
||||
return Err(KmerError::InvalidK { k });
|
||||
}
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
if i + k > seql {
|
||||
return Err(KmerError::OutOfBounds {
|
||||
position: i,
|
||||
@@ -351,7 +290,7 @@ impl SuperKmer {
|
||||
|
||||
/// Returns `true` if this super-kmer is in canonical form (lexicographic minimum of forward and revcomp).
|
||||
pub fn is_canonical(&self) -> bool {
|
||||
let seql = self.seql();
|
||||
let seql = self.len();
|
||||
for i in 0..seql {
|
||||
let fwd = self.nucleotide(i);
|
||||
let rev = complement(self.nucleotide(seql - 1 - i));
|
||||
@@ -398,14 +337,18 @@ struct SKKmerIter<'a> {
|
||||
|
||||
impl<'a> SKKmerIter<'a> {
|
||||
fn new(skmer: &'a SuperKmer, k: usize) -> Self {
|
||||
let seql = skmer.seql();
|
||||
let seql = skmer.len();
|
||||
let lshift = 64 - k * 2;
|
||||
let mask = ((!0u128) << (lshift + 2)) as u64;
|
||||
Self {
|
||||
skmer,
|
||||
mask,
|
||||
lshift,
|
||||
current: if seql >= k { skmer.kmer(0, k).unwrap().raw() } else { 0 },
|
||||
current: if seql >= k {
|
||||
skmer.kmer(0, k).unwrap().raw()
|
||||
} else {
|
||||
0
|
||||
},
|
||||
pos: k,
|
||||
max_pos: seql,
|
||||
}
|
||||
@@ -449,482 +392,6 @@ fn stored_to_len(s: u8) -> usize {
|
||||
if s == 0 { 256 } else { s as usize }
|
||||
}
|
||||
|
||||
// ── tests ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
/// Repeating ACGT pattern of the given length.
|
||||
fn make_seq(len: usize) -> Vec<u8> {
|
||||
(0..len).map(|i| b"ACGT"[i % 4]).collect()
|
||||
}
|
||||
|
||||
/// Reference revcomp on ASCII bytes.
|
||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
|
||||
seq.iter()
|
||||
.rev()
|
||||
.map(|&b| match b {
|
||||
b'A' => b'T',
|
||||
b'T' => b'A',
|
||||
b'C' => b'G',
|
||||
b'G' => b'C',
|
||||
_ => b'A',
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn all_lengths() -> impl Iterator<Item = usize> {
|
||||
(1..=9).chain([255, 256])
|
||||
}
|
||||
|
||||
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn kmer_first_matches_from_ascii() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmer = sk.kmer(0, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[..k], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_last_position() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let seql = ascii.len();
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmer = sk.kmer(seql - k, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[seql - k..], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_all_positions() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for i in 0..=ascii.len() - k {
|
||||
let kmer = sk.kmer(i, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[i..i + k], k).unwrap();
|
||||
assert_eq!(kmer, expected, "mismatch at position {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_out_of_bounds() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(2, 4).is_err()); // 2 + 4 > 4
|
||||
assert!(sk.kmer(4, 1).is_err()); // 4 + 1 > 4
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_invalid_k() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(0, 0).is_err());
|
||||
assert!(sk.kmer(0, 33).is_err());
|
||||
}
|
||||
|
||||
// ── canonical_kmer ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_is_min_of_kmer_and_revcomp() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
let k = 4;
|
||||
for i in 0..=(sk.seql() - k) {
|
||||
let ck = sk.canonical_kmer(i, k).unwrap();
|
||||
let fwd = sk.kmer(i, k).unwrap();
|
||||
assert_eq!(ck, fwd.canonical(k));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_palindrome_unchanged() {
|
||||
// ACGT is its own reverse complement
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let fwd = sk.kmer(0, 4).unwrap();
|
||||
assert_eq!(ck, fwd);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_tttt_becomes_aaaa() {
|
||||
let sk = SuperKmer::from_ascii(b"TTTT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(ck, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_errors_propagate() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.canonical_kmer(2, 4).is_err()); // out of bounds
|
||||
assert!(sk.canonical_kmer(0, 0).is_err()); // invalid k
|
||||
}
|
||||
|
||||
// ── count ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn count_starts_at_zero() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert_eq!(sk.count(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_adds_one() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 1);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_increases_count() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(42);
|
||||
assert_eq!(sk.count(), 42);
|
||||
sk.add(8);
|
||||
assert_eq!(sk.count(), 50);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_overwrites() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(100);
|
||||
sk.set_count(7);
|
||||
assert_eq!(sk.count(), 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.increment();
|
||||
assert_eq!(sk.seql(), len, "increment altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.add(1000);
|
||||
assert_eq!(sk.seql(), len, "add altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.set_count(999);
|
||||
assert_eq!(sk.seql(), len, "set_count altered seql for len={len}");
|
||||
assert_eq!(sk.count(), 999);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn count_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_count(16_000_000);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── seql encoding ─────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn seql_roundtrip() {
|
||||
for len in all_lengths() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
assert_eq!(sk.seql(), len, "seql() wrong for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seql_256_stored_as_zero() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(256));
|
||||
assert_eq!(sk.header.seql(), 0u8);
|
||||
assert_eq!(sk.seql(), 256);
|
||||
}
|
||||
|
||||
// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn ascii_roundtrip_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
assert_eq!(sk.to_ascii(), ascii, "roundtrip failed for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_roundtrip_all_bases() {
|
||||
for (base, expected) in [(b'A', b'A'), (b'C', b'C'), (b'G', b'G'), (b'T', b'T')] {
|
||||
let ascii = vec![base; 4];
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
assert_eq!(sk.to_ascii(), vec![expected; 4]);
|
||||
}
|
||||
}
|
||||
|
||||
// ── revcomp correctness ───────────────────────────────────────────────────
|
||||
|
||||
/// Known (seq, expected_revcomp) pairs — one per shift value × two byte counts.
|
||||
#[test]
|
||||
fn revcomp_known_values() {
|
||||
let cases = [
|
||||
// shift=6
|
||||
("A", "T"),
|
||||
("ACGTA", "TACGT"),
|
||||
// shift=4
|
||||
("AC", "GT"),
|
||||
("ACGTAC", "GTACGT"),
|
||||
// shift=2
|
||||
("ACG", "CGT"),
|
||||
("ACGTACG", "CGTACGT"),
|
||||
// shift=0
|
||||
("ACGT", "ACGT"),
|
||||
("ACGTACGT", "ACGTACGT"),
|
||||
];
|
||||
for (seq, expected) in cases {
|
||||
let mut sk = SuperKmer::from_ascii(seq.as_bytes());
|
||||
sk.revcomp();
|
||||
assert_eq!(
|
||||
sk.to_ascii(),
|
||||
expected.as_bytes(),
|
||||
"revcomp wrong for \"{seq}\""
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn revcomp_vs_reference_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let expected = ascii_revcomp(&ascii);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.revcomp();
|
||||
assert_eq!(sk.to_ascii(), expected, "revcomp wrong for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn revcomp_involution_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.revcomp();
|
||||
sk.revcomp();
|
||||
assert_eq!(sk.to_ascii(), ascii, "revcomp∘revcomp≠id for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── canonical ─────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn canonical_palindrome_unchanged() {
|
||||
// ACGT is its own revcomp
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"ACGT");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_forward() {
|
||||
// "AAAA" < "TTTT" → stays as-is
|
||||
let mut sk = SuperKmer::from_ascii(b"AAAA");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_chooses_revcomp() {
|
||||
// "TTTT" > "AAAA" → flipped
|
||||
let mut sk = SuperKmer::from_ascii(b"TTTT");
|
||||
sk.canonical();
|
||||
assert_eq!(sk.to_ascii(), b"AAAA");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_is_minimal_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.canonical();
|
||||
let fwd = sk.to_ascii();
|
||||
let rev = ascii_revcomp(&fwd);
|
||||
assert!(fwd <= rev, "canonical not minimal for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
// ── scatter / routing lifecycle ───────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_roundtrip() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(42);
|
||||
assert_eq!(sk.minimizer_pos(), 42);
|
||||
assert_eq!(sk.seql(), 8, "set_minimizer_pos altered seql");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_boundary_values() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(0);
|
||||
assert_eq!(sk.minimizer_pos(), 0);
|
||||
sk.set_minimizer_pos(255);
|
||||
assert_eq!(sk.minimizer_pos(), 255);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn init_count_resets_to_one_and_enables_counting() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGTACGT");
|
||||
sk.set_minimizer_pos(7);
|
||||
sk.init_count();
|
||||
assert_eq!(sk.count(), 1);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
sk.add(10);
|
||||
assert_eq!(sk.count(), 12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn init_count_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.set_minimizer_pos(0);
|
||||
sk.init_count();
|
||||
assert_eq!(sk.seql(), len, "init_count altered seql for len={len}");
|
||||
assert_eq!(sk.count(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn minimizer_pos_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_minimizer_pos(3);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_count() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for k in [1usize, 3, 4, 5, 8, 12] {
|
||||
let n = sk.iter_kmers(k).count();
|
||||
assert_eq!(n, ascii.len() - k + 1, "count mismatch for k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_first_is_kmer_0() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
for k in 1..=ascii.len() {
|
||||
let first = sk.iter_kmers(k).next().unwrap();
|
||||
assert_eq!(first, sk.kmer(0, k).unwrap(), "k={k}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_matches_kmer_at_each_position() {
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), ascii.len() - k + 1);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "mismatch at pos {i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_single_when_seql_eq_k() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = ascii.len();
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), 1);
|
||||
assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_two_when_seql_eq_k_plus_one() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = ascii.len() - 1;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), 2);
|
||||
assert_eq!(kmers[0], sk.kmer(0, k).unwrap());
|
||||
assert_eq!(kmers[1], sk.kmer(1, k).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_all_k_values() {
|
||||
// For every valid k, each yielded kmer must match kmer(i, k).
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let seql = ascii.len();
|
||||
for k in 1..=seql {
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), seql - k + 1, "k={k}");
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_crosses_byte_boundary() {
|
||||
// Positions 3→4 and 7→8 cross a 4-nucleotide byte boundary.
|
||||
let ascii = b"ACGTACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 3;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
for boundary in [3usize, 4, 7, 8] {
|
||||
if boundary + 1 < kmers.len() {
|
||||
assert_eq!(
|
||||
kmers[boundary],
|
||||
sk.kmer(boundary, k).unwrap(),
|
||||
"pos={boundary}"
|
||||
);
|
||||
assert_eq!(
|
||||
kmers[boundary + 1],
|
||||
sk.kmer(boundary + 1, k).unwrap(),
|
||||
"pos={}",
|
||||
boundary + 1
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_k1_yields_all_nucleotides() {
|
||||
let ascii = b"ACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(1).collect();
|
||||
assert_eq!(kmers.len(), 4);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, 1).unwrap(), "pos={i}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iter_kmers_long_sequence() {
|
||||
let ascii = make_seq(20);
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
let k = 7;
|
||||
let kmers: Vec<Kmer> = sk.iter_kmers(k).collect();
|
||||
assert_eq!(kmers.len(), ascii.len() - k + 1);
|
||||
for (i, &km) in kmers.iter().enumerate() {
|
||||
assert_eq!(km, sk.kmer(i, k).unwrap(), "pos={i}");
|
||||
}
|
||||
}
|
||||
}
|
||||
#[path = "tests/superkmer.rs"]
|
||||
mod tests;
|
||||
|
||||
@@ -0,0 +1,425 @@
|
||||
use super::*;
|
||||
|
||||
/// Repeating ACGT pattern of the given length.
|
||||
fn make_seq(len: usize) -> Vec<u8> {
    // Repeat the four bases endlessly and keep exactly `len` of them.
    b"ACGT".iter().copied().cycle().take(len).collect()
}
|
||||
|
||||
/// Reference revcomp on ASCII bytes.
|
||||
fn ascii_revcomp(seq: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(seq.len());
    // Walk the input back to front, complementing each base as we go.
    for &base in seq.iter().rev() {
        let complemented = match base {
            b'A' => b'T',
            b'T' => b'A',
            b'C' => b'G',
            b'G' => b'C',
            // Any other byte is mapped to 'A'.
            _ => b'A',
        };
        out.push(complemented);
    }
    out
}
|
||||
|
||||
fn all_lengths() -> impl Iterator<Item = usize> {
    // Lengths 1 through 9 followed by 255 and 256, in ascending order.
    (1..=256).filter(|n| *n <= 9 || *n >= 255)
}
|
||||
|
||||
// ── kmer extraction ───────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn kmer_first_matches_from_ascii() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let k = 4;
|
||||
let kmer = sk.kmer(0, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[..k], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_last_position() {
|
||||
let ascii = b"ACGTACGT";
|
||||
let seql = ascii.len();
|
||||
let k = 4;
|
||||
let sk = SuperKmer::from_ascii(ascii);
|
||||
let kmer = sk.kmer(seql - k, k).unwrap();
|
||||
let expected = crate::kmer::Kmer::from_ascii(&ascii[seql - k..], k).unwrap();
|
||||
assert_eq!(kmer, expected);
|
||||
}
|
||||
|
||||
#[test]
fn kmer_all_positions() {
    const K: usize = 4;
    let text = b"ACGTACGTACGT";
    let packed = SuperKmer::from_ascii(text);
    // Each length-K window of the ASCII input must equal the k-mer extracted
    // from the packed super-kmer at the same offset.
    for (i, window) in text.windows(K).enumerate() {
        let kmer = packed.kmer(i, K).unwrap();
        let expected = crate::kmer::Kmer::from_ascii(window, K).unwrap();
        assert_eq!(kmer, expected, "mismatch at position {i}");
    }
}
|
||||
|
||||
#[test]
|
||||
fn kmer_out_of_bounds() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(2, 4).is_err()); // 2 + 4 > 4
|
||||
assert!(sk.kmer(4, 1).is_err()); // 4 + 1 > 4
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn kmer_invalid_k() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.kmer(0, 0).is_err());
|
||||
assert!(sk.kmer(0, 33).is_err());
|
||||
}
|
||||
|
||||
// ── canonical_kmer ────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn canonical_kmer_is_min_of_kmer_and_revcomp() {
    let width = 4;
    let packed = SuperKmer::from_ascii(b"ACGTACGT");
    let last_start = packed.len() - width;
    // At every valid start, `canonical_kmer` must agree with canonicalising
    // the forward k-mer taken from the same position.
    (0..=last_start).for_each(|pos| {
        let canonical = packed.canonical_kmer(pos, width).unwrap();
        let forward = packed.kmer(pos, width).unwrap();
        assert_eq!(canonical, forward.canonical(width));
    });
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_palindrome_unchanged() {
|
||||
// ACGT is its own reverse complement
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let fwd = sk.kmer(0, 4).unwrap();
|
||||
assert_eq!(ck, fwd);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_tttt_becomes_aaaa() {
|
||||
let sk = SuperKmer::from_ascii(b"TTTT");
|
||||
let ck = sk.canonical_kmer(0, 4).unwrap();
|
||||
let expected = Kmer::from_ascii(b"AAAA", 4).unwrap();
|
||||
assert_eq!(ck, expected);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn canonical_kmer_errors_propagate() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert!(sk.canonical_kmer(2, 4).is_err()); // out of bounds
|
||||
assert!(sk.canonical_kmer(0, 0).is_err()); // invalid k
|
||||
}
|
||||
|
||||
// ── count ─────────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn count_starts_at_one() {
|
||||
let sk = SuperKmer::from_ascii(b"ACGT");
|
||||
assert_eq!(sk.count(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_adds_one() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 2);
|
||||
sk.increment();
|
||||
assert_eq!(sk.count(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_increases_count() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(42);
|
||||
assert_eq!(sk.count(), 43);
|
||||
sk.add(8);
|
||||
assert_eq!(sk.count(), 51);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn set_count_overwrites() {
|
||||
let mut sk = SuperKmer::from_ascii(b"ACGT");
|
||||
sk.add(100);
|
||||
sk.set_count(7);
|
||||
assert_eq!(sk.count(), 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn increment_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.increment();
|
||||
assert_eq!(sk.len(), len, "increment altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_preserves_seql() {
|
||||
for len in all_lengths() {
|
||||
let mut sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
sk.add(1000);
|
||||
assert_eq!(sk.len(), len, "add altered seql for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn set_count_preserves_seql() {
    // Overwriting the count must leave the length intact and read back
    // the value just written.
    all_lengths().for_each(|len| {
        let ascii = make_seq(len);
        let mut packed = SuperKmer::from_ascii(&ascii);
        packed.set_count(999);
        assert_eq!(packed.len(), len, "set_count altered seql for len={len}");
        assert_eq!(packed.count(), 999);
    });
}
|
||||
|
||||
#[test]
|
||||
fn count_does_not_affect_sequence() {
|
||||
let ascii = b"ACGTACGT".to_vec();
|
||||
let mut sk = SuperKmer::from_ascii(&ascii);
|
||||
sk.set_count(16_000_000);
|
||||
assert_eq!(sk.to_ascii(), ascii);
|
||||
}
|
||||
|
||||
// ── seql encoding ─────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn seql_roundtrip() {
|
||||
for len in all_lengths() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(len));
|
||||
assert_eq!(sk.len(), len, "seql() wrong for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn seql_256_stored_as_zero() {
|
||||
let sk = SuperKmer::from_ascii(&make_seq(256));
|
||||
assert_eq!(sk.header.seql(), 0u8);
|
||||
assert_eq!(sk.len(), 256);
|
||||
}
|
||||
|
||||
// ── from_ascii / to_ascii roundtrip ───────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn ascii_roundtrip_all_lengths() {
|
||||
for len in all_lengths() {
|
||||
let ascii = make_seq(len);
|
||||
let sk = SuperKmer::from_ascii(&ascii);
|
||||
assert_eq!(sk.to_ascii(), ascii, "roundtrip failed for len={len}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn ascii_roundtrip_all_bases() {
    // (input base, base expected after canonicalisation): a homopolymer of
    // G decodes as C and one of T decodes as A, since the canonical form is
    // min(seq, revcomp).
    let cases: [(u8, u8); 4] = [(b'A', b'A'), (b'C', b'C'), (b'G', b'C'), (b'T', b'A')];
    for &(base, canonical_base) in cases.iter() {
        let input = [base; 4];
        let decoded = SuperKmer::from_ascii(&input).to_ascii();
        assert_eq!(decoded, vec![canonical_base; 4]);
    }
}
|
||||
|
||||
// ── revcomp correctness ───────────────────────────────────────────────────
|
||||
|
||||
/// Known (seq, expected_revcomp) pairs — one per shift value × two byte counts.
|
||||
#[test]
fn revcomp_known_values() {
    // (input, expected reverse complement), grouped by the shift value
    // each length exercises.
    const CASES: [(&str, &str); 8] = [
        // shift=6
        ("A", "T"),
        ("ACGTA", "TACGT"),
        // shift=4
        ("AC", "GT"),
        ("ACGTAC", "GTACGT"),
        // shift=2
        ("ACG", "CGT"),
        ("ACGTACG", "CGTACGT"),
        // shift=0
        ("ACGT", "ACGT"),
        ("ACGTACGT", "ACGTACGT"),
    ];
    for &(seq, expected) in CASES.iter() {
        let mut packed = SuperKmer::from_ascii(seq.as_bytes());
        packed.revcomp();
        let decoded = packed.to_ascii();
        assert_eq!(decoded, expected.as_bytes(), "revcomp wrong for \"{seq}\"");
    }
}
|
||||
|
||||
#[test]
fn revcomp_vs_reference_all_lengths() {
    // Cross-check the in-place revcomp against the ASCII reference helper
    // for every length yielded by `all_lengths()`.
    for n in all_lengths() {
        let forward = make_seq(n);
        let reference = ascii_revcomp(&forward);
        let mut packed = SuperKmer::from_ascii(&forward);
        packed.revcomp();
        assert_eq!(packed.to_ascii(), reference, "revcomp wrong for len={len}", len = n);
    }
}
|
||||
|
||||
#[test]
fn revcomp_involution_all_lengths() {
    // Applying revcomp twice must give back the starting sequence.
    for n in all_lengths() {
        let original = make_seq(n);
        let mut sk = SuperKmer::from_ascii(&original);
        for _ in 0..2 {
            sk.revcomp();
        }
        assert_eq!(sk.to_ascii(), original, "revcomp∘revcomp≠id for len={len}", len = n);
    }
}
|
||||
|
||||
// ── canonical ─────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn canonical_palindrome_unchanged() {
    // ACGT equals its own reverse complement, so canonicalising must leave it as is.
    let palindrome = b"ACGT";
    let mut sk = SuperKmer::from_ascii(palindrome);
    sk.canonical();
    assert_eq!(sk.to_ascii(), palindrome);
}
|
||||
|
||||
#[test]
fn canonical_chooses_forward() {
    // "AAAA" sorts before its reverse complement "TTTT", so it is kept unchanged.
    let already_minimal = b"AAAA";
    let mut sk = SuperKmer::from_ascii(already_minimal);
    sk.canonical();
    assert_eq!(sk.to_ascii(), already_minimal);
}
|
||||
|
||||
#[test]
fn canonical_chooses_revcomp() {
    // "TTTT" sorts after its reverse complement "AAAA", so the flipped form is expected.
    let mut sk = SuperKmer::from_ascii(b"TTTT");
    sk.canonical();
    let chosen = sk.to_ascii();
    assert_eq!(chosen, b"AAAA");
}
|
||||
|
||||
#[test]
fn canonical_is_minimal_all_lengths() {
    // After canonical(), the stored sequence must not sort after its own
    // reverse complement, whatever the length.
    for n in all_lengths() {
        let mut sk = SuperKmer::from_ascii(&make_seq(n));
        sk.canonical();
        let chosen = sk.to_ascii();
        let flipped = ascii_revcomp(&chosen);
        assert!(chosen <= flipped, "canonical not minimal for len={len}", len = n);
    }
}
|
||||
|
||||
// ── iter_kmers ────────────────────────────────────────────────────────────
|
||||
|
||||
#[test]
fn iter_kmers_count() {
    // A sequence of L nucleotides must yield L - k + 1 k-mers.
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    for k in [1usize, 3, 4, 5, 8, 12] {
        let yielded = sk.iter_kmers(k).count();
        let expected = ascii.len() - k + 1;
        assert_eq!(yielded, expected, "count mismatch for k={k}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_first_is_kmer_0() {
    // The first item of the iterator must equal random access at position 0,
    // for every k from 1 up to the sequence length.
    let ascii = b"ACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    for k in 1..=ascii.len() {
        let head = sk.iter_kmers(k).next().unwrap();
        let direct = sk.kmer(0, k).unwrap();
        assert_eq!(head, direct, "k={k}");
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_matches_kmer_at_each_position() {
    // Every k-mer produced by the iterator must equal `kmer(pos, k)`.
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let k: usize = 4;
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    for (pos, km) in kmers.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(pos, k).unwrap(), "mismatch at pos {i}", i = pos);
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_single_when_seql_eq_k() {
    // With k equal to the sequence length there is exactly one k-mer.
    let ascii = b"ACGTACGT";
    let k = ascii.len();
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 1);
    let whole = sk.kmer(0, k).unwrap();
    assert_eq!(kmers[0], whole);
}
|
||||
|
||||
#[test]
fn iter_kmers_two_when_seql_eq_k_plus_one() {
    // With k one less than the sequence length there are exactly two k-mers,
    // at positions 0 and 1.
    let ascii = b"ACGTACGT";
    let k = ascii.len() - 1;
    let sk = SuperKmer::from_ascii(ascii);
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), 2);
    for pos in 0..2 {
        assert_eq!(kmers[pos], sk.kmer(pos, k).unwrap());
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_all_k_values() {
    // Exhaustive check: for each k in 1..=len, both the number of k-mers and
    // each individual k-mer must agree with `kmer(pos, k)`.
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let seql = ascii.len();
    for k in 1..=seql {
        let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
        assert_eq!(kmers.len(), seql - k + 1, "k={k}");
        kmers.iter().copied().enumerate().for_each(|(i, km)| {
            assert_eq!(km, sk.kmer(i, k).unwrap(), "k={k}, pos={i}");
        });
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_crosses_byte_boundary() {
    // Four nucleotides are packed per byte, so positions 3→4 and 7→8 straddle
    // a byte boundary. Check each listed position and the one after it.
    let ascii = b"ACGTACGTACGT";
    let sk = SuperKmer::from_ascii(ascii);
    let k = 3;
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    for boundary in [3usize, 4, 7, 8] {
        // Skip pairs whose second position would fall outside the k-mer list.
        if boundary + 1 >= kmers.len() {
            continue;
        }
        for pos in [boundary, boundary + 1] {
            assert_eq!(kmers[pos], sk.kmer(pos, k).unwrap(), "pos={}", pos);
        }
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_k1_yields_all_nucleotides() {
    // With k = 1 a 4-nucleotide sequence yields four k-mers, each matching
    // `kmer(pos, 1)`.
    let sk = SuperKmer::from_ascii(b"ACGT");
    let singles = sk.iter_kmers(1).collect::<Vec<Kmer>>();
    assert_eq!(singles.len(), 4);
    for (pos, km) in singles.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(pos, 1).unwrap(), "pos={i}", i = pos);
    }
}
|
||||
|
||||
#[test]
fn iter_kmers_long_sequence() {
    // Same count and position checks on a 20-nucleotide sequence with k = 7.
    let ascii = make_seq(20);
    let sk = SuperKmer::from_ascii(&ascii);
    let k: usize = 7;
    let kmers = sk.iter_kmers(k).collect::<Vec<Kmer>>();
    assert_eq!(kmers.len(), ascii.len() - k + 1);
    for (pos, km) in kmers.iter().copied().enumerate() {
        assert_eq!(km, sk.kmer(pos, k).unwrap(), "pos={i}", i = pos);
    }
}
|
||||
@@ -4,6 +4,8 @@
|
||||
//! at the MSB of `seq[0]`, 4 bases per byte — but without the 256-nucleotide
|
||||
//! length cap and without the scatter/count header payload.
|
||||
|
||||
use std::io::{self, Write};
|
||||
|
||||
use crate::encoding::{DEC4, encode_base};
|
||||
use crate::kmer::{Kmer, KmerError};
|
||||
use crate::revcomp_lookup::REVCOMP4;
|
||||
@@ -101,23 +103,24 @@ impl Unitig {
|
||||
(self.seq[i / 4] >> (6 - 2 * (i % 4))) & 0b11
|
||||
}
|
||||
|
||||
/// Decode into ASCII nucleotides, appending into `buf`.
|
||||
pub fn write_ascii(&self, buf: &mut Vec<u8>) {
|
||||
/// Decode into ASCII nucleotides, writing into `writer`.
|
||||
pub fn write_ascii<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let full = self.seql / 4;
|
||||
for i in 0..full {
|
||||
buf.extend_from_slice(&DEC4[self.seq[i] as usize].to_be_bytes());
|
||||
writer.write_all(&DEC4[self.seq[i] as usize].to_be_bytes())?;
|
||||
}
|
||||
let rem = self.seql % 4;
|
||||
if rem > 0 {
|
||||
let bytes = DEC4[self.seq[full] as usize].to_be_bytes();
|
||||
buf.extend_from_slice(&bytes[..rem]);
|
||||
writer.write_all(&bytes[..rem])?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Decode into a fresh ASCII `Vec<u8>`.
|
||||
pub fn to_ascii(&self) -> Vec<u8> {
|
||||
let mut buf = Vec::with_capacity(self.seql);
|
||||
self.write_ascii(&mut buf);
|
||||
self.write_ascii(&mut buf).unwrap();
|
||||
buf
|
||||
}
|
||||
|
||||
|
||||
@@ -16,12 +16,12 @@
|
||||
//! | super-kmer length = 256| k |
|
||||
|
||||
use obikrope::{ForwardCursor, Rope, RopeCursor};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
use crate::rolling_stat::RollingStat;
|
||||
use crate::scratch::SuperKmerScratch;
|
||||
|
||||
/// Iterator over `(minimizer_hash, SuperKmer)` pairs.
|
||||
/// Iterator over [`RoutableSuperKmer`] values.
|
||||
pub struct SuperKmerIter<'a> {
|
||||
cursor: ForwardCursor<'a>,
|
||||
k: usize,
|
||||
@@ -60,26 +60,19 @@ impl<'a> SuperKmerIter<'a> {
|
||||
self.prev_min_pos = 0;
|
||||
}
|
||||
|
||||
fn try_emit(&mut self) -> Option<SuperKmer> {
|
||||
fn try_emit(&mut self) -> Option<RoutableSuperKmer> {
|
||||
if self.scratch.len() < self.k {
|
||||
return None;
|
||||
}
|
||||
let min = self.prev_min?;
|
||||
let mut sk = self.scratch.emit();
|
||||
let min_pos = if sk.canonical() {
|
||||
self.prev_min_pos
|
||||
} else {
|
||||
sk.seql() - self.m - self.prev_min_pos
|
||||
};
|
||||
sk.set_minimizer_pos(min_pos as u8);
|
||||
Some(sk)
|
||||
self.prev_min?;
|
||||
Some(self.scratch.emit(self.prev_min_pos, self.m))
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for SuperKmerIter<'_> {
|
||||
type Item = SuperKmer;
|
||||
type Item = RoutableSuperKmer;
|
||||
|
||||
fn next(&mut self) -> Option<SuperKmer> {
|
||||
fn next(&mut self) -> Option<RoutableSuperKmer> {
|
||||
loop {
|
||||
let byte = match self.cursor.read_next().ok() {
|
||||
None => {
|
||||
@@ -164,7 +157,7 @@ mod tests {
|
||||
fn run_nofilter(data: &[u8], k: usize, m: usize) -> Vec<Vec<u8>> {
|
||||
let rope = make_rope(data);
|
||||
SuperKmerIter::new(&rope, k, m, 1, 0.0)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -205,7 +198,7 @@ mod tests {
|
||||
|
||||
let rope = make_rope(b"AAAAAAAAAAAAAAAAAAAA\x00");
|
||||
let out_reject: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 6, 0.9)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect();
|
||||
assert!(out_reject.is_empty());
|
||||
}
|
||||
@@ -218,7 +211,7 @@ mod tests {
|
||||
rope.push(data[..mid].to_vec());
|
||||
rope.push(data[mid..].to_vec());
|
||||
let out: Vec<Vec<u8>> = SuperKmerIter::new(&rope, K, M, 1, 0.0)
|
||||
.map(|sk| sk.to_ascii())
|
||||
.map(|rsk| rsk.superkmer().to_ascii())
|
||||
.collect();
|
||||
assert!(!out.is_empty());
|
||||
}
|
||||
@@ -226,7 +219,7 @@ mod tests {
|
||||
#[test]
|
||||
fn yields_minimizer_value() {
|
||||
let rope = make_rope(b"ACGTACGTACGTACGTACGT\x00");
|
||||
let results: Vec<SuperKmer> = SuperKmerIter::new(&rope, K, M, 1, 0.0).collect();
|
||||
let results: Vec<RoutableSuperKmer> = SuperKmerIter::new(&rope, K, M, 1, 0.0).collect();
|
||||
assert!(!results.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,9 +16,9 @@ pub use iter::SuperKmerIter;
|
||||
pub use scratch::SuperKmerScratch;
|
||||
|
||||
use obikrope::Rope;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
/// Collect all super-kmers from a normalised rope chunk.
|
||||
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<SuperKmer> {
|
||||
pub fn build_superkmers(rope: Rope, k: usize, m: usize, level_max: usize, theta: f64) -> Vec<RoutableSuperKmer> {
|
||||
SuperKmerIter::new(&rope, k, m, level_max, theta).collect()
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Stack-allocated scratch buffer for building a SuperKmer before heap emission.
|
||||
|
||||
use crate::encoding::{BYTE_LEN_MAX, encode_nuc};
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use obikseq::RoutableSuperKmer;
|
||||
|
||||
/// Maximum nucleotides in a super-kmer (fits one `u64` segment window, kept ≤ 256).
|
||||
pub const MAX_SUPERKMER_LEN: usize = 256;
|
||||
@@ -56,16 +56,15 @@ impl SuperKmerScratch {
|
||||
///
|
||||
/// The heap allocation (`Box<[u8]>`) is exactly sized to the sequence.
|
||||
/// Resets the buffer to empty afterward.
|
||||
pub fn emit(&mut self) -> SuperKmer {
|
||||
pub fn emit(&mut self, min_pos: usize, m: usize) -> RoutableSuperKmer {
|
||||
let seql = self.len;
|
||||
debug_assert!(seql >= 1 && seql <= MAX_SUPERKMER_LEN);
|
||||
let n = (seql + 3) / 4;
|
||||
let seq: Box<[u8]> = self.buf[..n].into();
|
||||
self.buf[..n].fill(0);
|
||||
self.len = 0;
|
||||
SuperKmer::new(seql as u8, seq)
|
||||
RoutableSuperKmer::build(min_pos, m, seql as u8, seq)
|
||||
}
|
||||
|
||||
/// Discard all accumulated nucleotides without producing a [`SuperKmer`].
|
||||
pub fn reset(&mut self) {
|
||||
let n = (self.len + 3) / 4;
|
||||
|
||||
+33
-15
@@ -2,17 +2,25 @@ use obikseq::superkmer::SuperKmer;
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// Serialise one SuperKmer into `w` (uncompressed; caller must wrap with a compressor).
|
||||
///
|
||||
/// Bits [7:0] of the header store `n_kmers = seql - k + 1` (kmer units, 1–255),
|
||||
/// not the raw nucleotide length. This removes the 0=256 wrapping convention.
|
||||
#[inline]
|
||||
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer) -> io::Result<()> {
|
||||
w.write_all(&sk.header_bits().to_le_bytes())?;
|
||||
pub(crate) fn write_superkmer<W: Write>(w: &mut W, sk: &SuperKmer, k: usize) -> io::Result<()> {
|
||||
let n_kmers = sk.len() - k + 1;
|
||||
let new_bits = (sk.header_bits() & !0xFF) | (n_kmers as u32);
|
||||
w.write_all(&new_bits.to_le_bytes())?;
|
||||
w.write_all(sk.seq_bytes())
|
||||
}
|
||||
|
||||
/// Deserialise one SuperKmer from `r`. Returns `None` on clean EOF.
|
||||
/// `seq_buf` is a reusable scratch buffer to avoid per-record allocation.
|
||||
/// Bits [7:0] of the on-disk header contain `n_kmers`; nucleotide length is
|
||||
/// reconstructed as `n_kmers + k - 1`.
|
||||
pub(crate) fn read_superkmer<R: Read>(
|
||||
r: &mut R,
|
||||
seq_buf: &mut Vec<u8>,
|
||||
k: usize,
|
||||
) -> io::Result<Option<SuperKmer>> {
|
||||
let mut hdr = [0u8; 4];
|
||||
match r.read_exact(&mut hdr) {
|
||||
@@ -21,12 +29,18 @@ pub(crate) fn read_superkmer<R: Read>(
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
let bits = u32::from_le_bytes(hdr);
|
||||
let seql_byte = (bits & 0xFF) as u8;
|
||||
let nt_len: usize = if seql_byte == 0 { 256 } else { seql_byte as usize };
|
||||
let n_kmers = (bits & 0xFF) as usize;
|
||||
let nt_len = n_kmers + k - 1;
|
||||
let byte_len = (nt_len + 3) / 4;
|
||||
seq_buf.resize(byte_len, 0);
|
||||
r.read_exact(seq_buf)?;
|
||||
Ok(Some(SuperKmer::from_header_bits(bits, seq_buf.as_slice().into())))
|
||||
// Reconstruct the in-memory seql byte (0 encodes 256, 1-255 direct).
|
||||
let seql_byte = if nt_len == 256 { 0u8 } else { nt_len as u8 };
|
||||
let mem_bits = (bits & !0xFF) | (seql_byte as u32);
|
||||
Ok(Some(SuperKmer::from_header_bits(
|
||||
mem_bits,
|
||||
seq_buf.as_slice().into(),
|
||||
)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -40,28 +54,31 @@ mod tests {
|
||||
|
||||
#[test]
fn roundtrip_single() {
    // One record written with k=4 must read back with the same sequence and length.
    let k = 4;
    let sk = make_sk(b"ACGTACGT");
    let mut buf = Vec::new();
    write_superkmer(&mut buf, &sk, k).unwrap();

    let mut cur = Cursor::new(&buf);
    let mut seq_buf = Vec::new();
    let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
    assert_eq!(sk.to_ascii(), got.to_ascii());
    assert_eq!(sk.len(), got.len());
}
|
||||
|
||||
#[test]
fn roundtrip_all_lengths() {
    let bases: Vec<u8> = (0..256).map(|i| b"ACGT"[i % 4]).collect();
    // k=11 is the project minimum; test seql from k to 256.
    let k = 11;
    for len in (k..=k + 8).chain([255, 256]) {
        let sk = make_sk(&bases[..len]);
        let mut buf = Vec::new();
        write_superkmer(&mut buf, &sk, k).unwrap();

        let mut cur = Cursor::new(&buf);
        let mut seq_buf = Vec::new();
        let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
        assert_eq!(sk.to_ascii(), got.to_ascii(), "len={len}");
    }
}
|
||||
@@ -71,24 +88,25 @@ mod tests {
|
||||
let buf: Vec<u8> = vec![];
|
||||
let mut cur = Cursor::new(&buf);
|
||||
let mut seq_buf = Vec::new();
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf).unwrap().is_none());
|
||||
assert!(read_superkmer(&mut cur, &mut seq_buf, 4).unwrap().is_none());
|
||||
}
|
||||
|
||||
#[test]
fn multiple_records() {
    // Several records written back to back must be read in the same order,
    // followed by a clean end-of-stream (`None`).
    let k = 4;
    let seqs: &[&[u8]] = &[b"AAAA", b"CCCC", b"GGGG", b"TTTT"];
    let mut buf = Vec::new();
    for s in seqs {
        write_superkmer(&mut buf, &make_sk(s), k).unwrap();
    }

    let mut cur = Cursor::new(&buf);
    let mut seq_buf = Vec::new();
    for s in seqs {
        let got = read_superkmer(&mut cur, &mut seq_buf, k).unwrap().unwrap();
        let expected = make_sk(s);
        assert_eq!(expected.to_ascii(), got.to_ascii());
    }
    assert!(read_superkmer(&mut cur, &mut seq_buf, k).unwrap().is_none());
}
|
||||
}
|
||||
|
||||
+63
-35
@@ -3,8 +3,8 @@ use crate::error::SKResult;
|
||||
use crate::limits::max_concurrent_files;
|
||||
use crate::meta::SKFileMeta;
|
||||
use lru::LruCache;
|
||||
use niffler::send::compression::Format;
|
||||
use niffler::Level;
|
||||
use niffler::send::compression::Format;
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{BufWriter, Write};
|
||||
@@ -79,7 +79,11 @@ impl SKFilePool {
|
||||
/// Create a pool allowing at most `max_open` simultaneously open fds.
|
||||
pub fn new(max_open: usize) -> Self {
|
||||
let cap = NonZeroUsize::new(max_open.max(1)).unwrap();
|
||||
Self { max_open, entries: Vec::new(), open: LruCache::new(cap) }
|
||||
Self {
|
||||
max_open,
|
||||
entries: Vec::new(),
|
||||
open: LruCache::new(cap),
|
||||
}
|
||||
}
|
||||
|
||||
/// Derive pool size from the OS fd limit (75 %, clamped to `[16, MAX_POOL_SIZE]`).
|
||||
@@ -218,6 +222,7 @@ pub struct SKFileWriter {
|
||||
id: usize,
|
||||
pool: Arc<Mutex<SKFilePool>>,
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
pending: Vec<u8>,
|
||||
flush_threshold: usize,
|
||||
logically_closed: bool,
|
||||
@@ -225,14 +230,15 @@ pub struct SKFileWriter {
|
||||
}
|
||||
|
||||
/// Create a `SKFileWriter` for a new file (Zstd, level 3).
|
||||
pub fn create_token(pool: &SharedPool, path: PathBuf) -> SKResult<SKFileWriter> {
|
||||
create_token_with(pool, path, Format::Zstd, Level::Three)
|
||||
pub fn create_token(pool: &SharedPool, path: PathBuf, k: usize) -> SKResult<SKFileWriter> {
|
||||
create_token_with(pool, path, k, Format::Zstd, Level::Three)
|
||||
}
|
||||
|
||||
/// Create a `SKFileWriter` for a new file with explicit format and level.
|
||||
pub fn create_token_with(
|
||||
pool: &SharedPool,
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
) -> SKResult<SKFileWriter> {
|
||||
@@ -241,6 +247,7 @@ pub fn create_token_with(
|
||||
id,
|
||||
pool: Arc::clone(pool),
|
||||
path,
|
||||
k,
|
||||
pending: Vec::with_capacity(DEFAULT_FLUSH_THRESHOLD + 128),
|
||||
flush_threshold: DEFAULT_FLUSH_THRESHOLD,
|
||||
logically_closed: false,
|
||||
@@ -251,13 +258,18 @@ pub fn create_token_with(
|
||||
impl SKFileWriter {
|
||||
/// Create a standalone file writer (Zstd, level 3).
|
||||
/// The pool is created internally and is not accessible to the caller.
|
||||
pub fn create<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
||||
Self::create_with(path, Format::Zstd, Level::Three)
|
||||
pub fn create<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
|
||||
Self::create_with(path, k, Format::Zstd, Level::Three)
|
||||
}
|
||||
|
||||
/// Create a standalone file writer with explicit format and level.
|
||||
pub fn create_with<P: AsRef<Path>>(path: P, format: Format, level: Level) -> SKResult<Self> {
|
||||
create_token_with(global_pool(), path.as_ref().to_owned(), format, level)
|
||||
pub fn create_with<P: AsRef<Path>>(
|
||||
path: P,
|
||||
k: usize,
|
||||
format: Format,
|
||||
level: Level,
|
||||
) -> SKResult<Self> {
|
||||
create_token_with(global_pool(), path.as_ref().to_owned(), k, format, level)
|
||||
}
|
||||
|
||||
/// `true` if the underlying fd is currently open in the pool.
|
||||
@@ -268,10 +280,10 @@ impl SKFileWriter {
|
||||
/// Accumulate one SuperKmer. Drains to fd when `pending ≥ flush_threshold`.
|
||||
pub fn write(&mut self, sk: &SuperKmer) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
write_superkmer(&mut self.pending, sk)?;
|
||||
write_superkmer(&mut self.pending, sk, self.k)?;
|
||||
self.meta.instances += 1;
|
||||
self.meta.count_sum += sk.count() as u64;
|
||||
self.meta.length_sum += sk.seql() as u64;
|
||||
self.meta.length_sum += sk.len() as u64;
|
||||
if self.pending.len() >= self.flush_threshold {
|
||||
self.drain()?;
|
||||
}
|
||||
@@ -282,10 +294,10 @@ impl SKFileWriter {
|
||||
pub fn write_batch(&mut self, sks: &[SuperKmer]) -> SKResult<()> {
|
||||
self.check_not_closed()?;
|
||||
for sk in sks {
|
||||
write_superkmer(&mut self.pending, sk)?;
|
||||
write_superkmer(&mut self.pending, sk, self.k)?;
|
||||
self.meta.instances += 1;
|
||||
self.meta.count_sum += sk.count() as u64;
|
||||
self.meta.length_sum += sk.seql() as u64;
|
||||
self.meta.length_sum += sk.len() as u64;
|
||||
if self.pending.len() >= self.flush_threshold {
|
||||
self.drain()?;
|
||||
}
|
||||
@@ -339,7 +351,10 @@ impl SKFileWriter {
|
||||
}
|
||||
|
||||
if !self.pending.is_empty() {
|
||||
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
|
||||
fd_guard
|
||||
.as_mut()
|
||||
.expect("fd open after ensure_open")
|
||||
.write_all(&self.pending)?;
|
||||
self.pending.clear();
|
||||
}
|
||||
if let Some(mut w) = fd_guard.take() {
|
||||
@@ -400,7 +415,10 @@ impl SKFileWriter {
|
||||
fd_guard = fd_arc.lock().unwrap(); // acquire fd lock under pool lock
|
||||
// pool drops here → pool lock released, fd lock still held
|
||||
}
|
||||
fd_guard.as_mut().expect("fd open after ensure_open").write_all(&self.pending)?;
|
||||
fd_guard
|
||||
.as_mut()
|
||||
.expect("fd open after ensure_open")
|
||||
.write_all(&self.pending)?;
|
||||
// fd_guard drops → entry fd lock released
|
||||
self.pending.clear();
|
||||
Ok(())
|
||||
@@ -424,6 +442,8 @@ mod tests {
|
||||
use obikseq::superkmer::SuperKmer;
|
||||
use tempfile::{NamedTempFile, TempDir};
|
||||
|
||||
const TEST_K: usize = 4;
|
||||
|
||||
fn make_sk(seed: usize) -> SuperKmer {
|
||||
let bases: Vec<u8> = (0..8).map(|j| b"ACGT"[(seed + j) % 4]).collect();
|
||||
SuperKmer::from_ascii(&bases)
|
||||
@@ -443,7 +463,7 @@ mod tests {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let p = pool(3);
|
||||
for i in 0..10 {
|
||||
create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap();
|
||||
create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap();
|
||||
}
|
||||
assert_eq!(p.lock().unwrap().open_count(), 0);
|
||||
}
|
||||
@@ -455,14 +475,18 @@ mod tests {
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut tokens: Vec<SKFileWriter> = (0..6)
|
||||
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst"))).unwrap())
|
||||
.map(|i| create_token(&p, dir.path().join(format!("p{i}.zst")), TEST_K).unwrap())
|
||||
.collect();
|
||||
|
||||
for t in tokens.iter_mut() {
|
||||
open_token(t, &sk);
|
||||
}
|
||||
|
||||
assert!(p.lock().unwrap().open_count() <= 3, "open={}", p.lock().unwrap().open_count());
|
||||
assert!(
|
||||
p.lock().unwrap().open_count() <= 3,
|
||||
"open={}",
|
||||
p.lock().unwrap().open_count()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -471,8 +495,8 @@ mod tests {
|
||||
let p = pool(1);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
|
||||
open_token(&mut t0, &sk); // t0 fd open, pool full
|
||||
open_token(&mut t1, &sk); // evicts t0, t1 fd open
|
||||
@@ -487,8 +511,8 @@ mod tests {
|
||||
let p = pool(1);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
|
||||
t0.set_flush_threshold(1);
|
||||
t0.write(&sk).unwrap(); // t0 fd open, pool full
|
||||
@@ -504,7 +528,7 @@ mod tests {
|
||||
p.lock().unwrap().close_all().unwrap();
|
||||
|
||||
for name in &["a.zst", "b.zst"] {
|
||||
let mut r = SKFileReader::open(dir.path().join(name)).unwrap();
|
||||
let mut r = SKFileReader::open(dir.path().join(name), TEST_K).unwrap();
|
||||
let got = r.read_batch(10).unwrap();
|
||||
assert_eq!(got.len(), 1, "{name}: expected 1 record");
|
||||
}
|
||||
@@ -516,9 +540,9 @@ mod tests {
|
||||
let p = pool(2);
|
||||
let sk = make_sk(0);
|
||||
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst")).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst")).unwrap();
|
||||
let mut t2 = create_token(&p, dir.path().join("c.zst")).unwrap();
|
||||
let mut t0 = create_token(&p, dir.path().join("a.zst"), TEST_K).unwrap();
|
||||
let mut t1 = create_token(&p, dir.path().join("b.zst"), TEST_K).unwrap();
|
||||
let mut t2 = create_token(&p, dir.path().join("c.zst"), TEST_K).unwrap();
|
||||
|
||||
open_token(&mut t0, &sk); // t0 open
|
||||
open_token(&mut t1, &sk); // t1 open, t0 LRU
|
||||
@@ -538,10 +562,14 @@ mod tests {
|
||||
fn close_all_produces_readable_files() {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let p = pool(8);
|
||||
let paths: Vec<_> = (0..4).map(|i| dir.path().join(format!("{i}.zst"))).collect();
|
||||
let paths: Vec<_> = (0..4)
|
||||
.map(|i| dir.path().join(format!("{i}.zst")))
|
||||
.collect();
|
||||
|
||||
let mut tokens: Vec<SKFileWriter> =
|
||||
paths.iter().map(|path| create_token(&p, path.clone()).unwrap()).collect();
|
||||
let mut tokens: Vec<SKFileWriter> = paths
|
||||
.iter()
|
||||
.map(|path| create_token(&p, path.clone(), TEST_K).unwrap())
|
||||
.collect();
|
||||
|
||||
for (i, t) in tokens.iter_mut().enumerate() {
|
||||
t.write(&make_sk(i)).unwrap();
|
||||
@@ -553,7 +581,7 @@ mod tests {
|
||||
p.lock().unwrap().close_all().unwrap();
|
||||
|
||||
for path in &paths {
|
||||
let mut r = SKFileReader::open(path).unwrap();
|
||||
let mut r = SKFileReader::open(path, TEST_K).unwrap();
|
||||
let got = r.read_batch(10).unwrap();
|
||||
assert_eq!(got.len(), 1);
|
||||
}
|
||||
@@ -566,11 +594,11 @@ mod tests {
|
||||
let sks: Vec<_> = (0..50).map(make_sk).collect();
|
||||
let path = dir.path().join("batch.zst");
|
||||
|
||||
let mut t = create_token(&p, path.clone()).unwrap();
|
||||
let mut t = create_token(&p, path.clone(), TEST_K).unwrap();
|
||||
t.write_batch(&sks).unwrap();
|
||||
t.close().unwrap();
|
||||
|
||||
let mut r = SKFileReader::open(&path).unwrap();
|
||||
let mut r = SKFileReader::open(&path, TEST_K).unwrap();
|
||||
let got = r.read_batch(100).unwrap();
|
||||
assert_eq!(got.len(), 50);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -590,11 +618,11 @@ mod tests {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let sks: Vec<_> = (0..100).map(make_sk).collect();
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
w.close().unwrap();
|
||||
}
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
let got = r.read_batch(200).unwrap();
|
||||
assert_eq!(got.len(), 100);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -605,7 +633,7 @@ mod tests {
|
||||
#[test]
|
||||
fn standalone_close_prevents_write() {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.close().unwrap();
|
||||
assert!(!w.is_open());
|
||||
assert!(w.write(&make_sk(0)).is_err());
|
||||
@@ -614,7 +642,7 @@ mod tests {
|
||||
#[test]
|
||||
fn standalone_is_physically_open() {
|
||||
let tmp = NamedTempFile::new().unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
assert!(!w.is_physically_open()); // fd deferred until first drain
|
||||
w.set_flush_threshold(1);
|
||||
w.write(&make_sk(0)).unwrap(); // triggers drain → fd opened
|
||||
|
||||
@@ -15,6 +15,7 @@ use std::path::{Path, PathBuf};
|
||||
/// that it can fast-forward on next open.
|
||||
pub struct SKFileReader {
|
||||
path: PathBuf,
|
||||
k: usize,
|
||||
reader: Option<Box<dyn std::io::Read + Send>>,
|
||||
/// Reusable scratch buffer for the `seq` bytes of each record.
|
||||
seq_buf: Vec<u8>,
|
||||
@@ -24,11 +25,13 @@ pub struct SKFileReader {
|
||||
|
||||
impl SKFileReader {
|
||||
/// Open a file for reading. Format is auto-detected from magic bytes.
|
||||
pub fn open<P: AsRef<Path>>(path: P) -> SKResult<Self> {
|
||||
/// `k` is the kmer size of the partition; required to decode the on-disk n_kmers field.
|
||||
pub fn open<P: AsRef<Path>>(path: P, k: usize) -> SKResult<Self> {
|
||||
let path = path.as_ref().to_owned();
|
||||
let (reader, _fmt) = niffler::send::get_reader(Box::new(BufReader::new(File::open(&path)?)))?;
|
||||
Ok(Self {
|
||||
path,
|
||||
k,
|
||||
reader: Some(reader),
|
||||
seq_buf: Vec::with_capacity(64),
|
||||
consumed: 0,
|
||||
@@ -43,7 +46,7 @@ impl SKFileReader {
|
||||
"read from physically closed SKFileReader",
|
||||
)
|
||||
})?;
|
||||
let result = read_superkmer(r, &mut self.seq_buf)?;
|
||||
let result = read_superkmer(r, &mut self.seq_buf, self.k)?;
|
||||
if result.is_some() {
|
||||
self.consumed += 1;
|
||||
}
|
||||
@@ -100,7 +103,7 @@ impl SKFileReader {
|
||||
let target = self.consumed;
|
||||
self.consumed = 0;
|
||||
for _ in 0..target {
|
||||
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf)? {
|
||||
match read_superkmer(self.reader.as_mut().unwrap(), &mut self.seq_buf, self.k)? {
|
||||
Some(_) => self.consumed += 1,
|
||||
None => break,
|
||||
}
|
||||
@@ -147,6 +150,8 @@ mod tests {
|
||||
use crate::pool::SKFileWriter;
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
const TEST_K: usize = 4; // test sequences are 8 bases; k=4 gives n_kmers=5
|
||||
|
||||
fn make_sks(n: usize) -> Vec<SuperKmer> {
|
||||
(0..n)
|
||||
.map(|i| {
|
||||
@@ -162,11 +167,11 @@ mod tests {
|
||||
let sks = make_sks(50);
|
||||
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
}
|
||||
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
let got: Vec<_> = r.iter().collect();
|
||||
assert_eq!(got.len(), 50);
|
||||
for (a, b) in sks.iter().zip(got.iter()) {
|
||||
@@ -180,11 +185,11 @@ mod tests {
|
||||
let sks = make_sks(20);
|
||||
|
||||
{
|
||||
let mut w = SKFileWriter::create(tmp.path()).unwrap();
|
||||
let mut w = SKFileWriter::create(tmp.path(), TEST_K).unwrap();
|
||||
w.write_batch(&sks).unwrap();
|
||||
}
|
||||
|
||||
let mut r = SKFileReader::open(tmp.path()).unwrap();
|
||||
let mut r = SKFileReader::open(tmp.path(), TEST_K).unwrap();
|
||||
// Read 10, then simulate pool eviction + re-access
|
||||
let first = r.read_batch(10).unwrap();
|
||||
r.close();
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
Signature: 8a477f597d28d172789f06886806bc55
|
||||
# This file is a cache directory tag created by cargo.
|
||||
# For information about cache directory tags see https://bford.info/cachedir/
|
||||
Binary file not shown.
@@ -1 +0,0 @@
|
||||
This file has an mtime of when this was started.
|
||||
@@ -1 +0,0 @@
|
||||
3b14e2b3d799d099
|
||||
Binary file not shown.
@@ -1 +0,0 @@
|
||||
This file has an mtime of when this was started.
|
||||
@@ -1 +0,0 @@
|
||||
dd7fdbdd12639eb8
|
||||
Binary file not shown.
@@ -1,5 +0,0 @@
|
||||
/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/deps/obikseq-4791c70657a715c0.d: obikseq/src/lib.rs
|
||||
|
||||
/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/deps/libobikseq-4791c70657a715c0.rmeta: obikseq/src/lib.rs
|
||||
|
||||
obikseq/src/lib.rs:
|
||||
@@ -1,5 +0,0 @@
|
||||
/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/deps/obikseq-5cc47015be91e3b1.d: obikseq/src/lib.rs
|
||||
|
||||
/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/deps/libobikseq-5cc47015be91e3b1.rmeta: obikseq/src/lib.rs
|
||||
|
||||
obikseq/src/lib.rs:
|
||||
BIN
Binary file not shown.
@@ -1,55 +0,0 @@
|
||||
warning: virtual workspace defaulting to `resolver = "1"` despite one or more workspace members being on edition 2021 which implies `resolver = "2"`
|
||||
|
|
||||
= note: to keep the current resolver, specify `workspace.resolver = "1"` in the workspace root's manifest
|
||||
= note: to use the edition 2021 resolver, specify `workspace.resolver = "2"` in the workspace root's manifest
|
||||
= note: for more details see https://doc.rust-lang.org/cargo/reference/resolver.html#resolver-versions
|
||||
0.005139042s INFO prepare_target{force=false package_id=obikseq v0.1.0 (/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq) target="obikseq"}: cargo::core::compiler::fingerprint: fingerprint error for obikseq v0.1.0 (/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq)/Check { test: false }/TargetInner { name_inferred: true, ..: lib_target("obikseq", ["lib"], "/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq/src/lib.rs", Edition2021) }
|
||||
0.005345417s INFO prepare_target{force=false package_id=obikseq v0.1.0 (/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq) target="obikseq"}: cargo::core::compiler::fingerprint: err: failed to read `/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/.fingerprint/obikseq-4791c70657a715c0/lib-obikseq`
|
||||
|
||||
Caused by:
|
||||
No such file or directory (os error 2)
|
||||
|
||||
Stack backtrace:
|
||||
0: std::backtrace::Backtrace::create
|
||||
1: cargo_util::paths::read_bytes
|
||||
2: cargo_util::paths::read
|
||||
3: cargo::core::compiler::fingerprint::_compare_old_fingerprint
|
||||
4: cargo::core::compiler::fingerprint::prepare_target
|
||||
5: cargo::core::compiler::compile
|
||||
6: <cargo::core::compiler::build_runner::BuildRunner>::compile
|
||||
7: cargo::ops::cargo_compile::compile_ws
|
||||
8: cargo::ops::cargo_compile::compile_with_exec
|
||||
9: cargo::ops::cargo_compile::compile
|
||||
10: cargo::commands::check::exec
|
||||
11: <cargo::cli::Exec>::exec
|
||||
12: cargo::main
|
||||
13: std::sys::backtrace::__rust_begin_short_backtrace::<fn(), ()>
|
||||
14: std::rt::lang_start::<()>::{closure#0}
|
||||
15: std::rt::lang_start_internal
|
||||
16: _main
|
||||
0.016672292s INFO prepare_target{force=false package_id=obikseq v0.1.0 (/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq) target="obikseq"}: cargo::core::compiler::fingerprint: fingerprint error for obikseq v0.1.0 (/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq)/Check { test: true }/TargetInner { name_inferred: true, ..: lib_target("obikseq", ["lib"], "/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq/src/lib.rs", Edition2021) }
|
||||
0.016685583s INFO prepare_target{force=false package_id=obikseq v0.1.0 (/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq) target="obikseq"}: cargo::core::compiler::fingerprint: err: failed to read `/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/.fingerprint/obikseq-5cc47015be91e3b1/test-lib-obikseq`
|
||||
|
||||
Caused by:
|
||||
No such file or directory (os error 2)
|
||||
|
||||
Stack backtrace:
|
||||
0: std::backtrace::Backtrace::create
|
||||
1: cargo_util::paths::read_bytes
|
||||
2: cargo_util::paths::read
|
||||
3: cargo::core::compiler::fingerprint::_compare_old_fingerprint
|
||||
4: cargo::core::compiler::fingerprint::prepare_target
|
||||
5: cargo::core::compiler::compile
|
||||
6: <cargo::core::compiler::build_runner::BuildRunner>::compile
|
||||
7: cargo::ops::cargo_compile::compile_ws
|
||||
8: cargo::ops::cargo_compile::compile_with_exec
|
||||
9: cargo::ops::cargo_compile::compile
|
||||
10: cargo::commands::check::exec
|
||||
11: <cargo::cli::Exec>::exec
|
||||
12: cargo::main
|
||||
13: std::sys::backtrace::__rust_begin_short_backtrace::<fn(), ()>
|
||||
14: std::rt::lang_start::<()>::{closure#0}
|
||||
15: std::rt::lang_start_internal
|
||||
16: _main
|
||||
Checking obikseq v0.1.0 (/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq)
|
||||
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.06s
|
||||
@@ -1,3 +0,0 @@
|
||||
{"reason":"compiler-artifact","package_id":"path+file:///Users/coissac/Sync/travail/__MOI__/obikmer/obikseq#0.1.0","manifest_path":"/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq/Cargo.toml","target":{"kind":["lib"],"crate_types":["lib"],"name":"obikseq","src_path":"/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq/src/lib.rs","edition":"2021","doc":true,"doctest":true,"test":true},"profile":{"opt_level":"0","debuginfo":2,"debug_assertions":true,"overflow_checks":true,"test":false},"features":[],"filenames":["/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/deps/libobikseq-4791c70657a715c0.rmeta"],"executable":null,"fresh":false}
|
||||
{"reason":"compiler-artifact","package_id":"path+file:///Users/coissac/Sync/travail/__MOI__/obikmer/obikseq#0.1.0","manifest_path":"/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq/Cargo.toml","target":{"kind":["lib"],"crate_types":["lib"],"name":"obikseq","src_path":"/Users/coissac/Sync/travail/__MOI__/obikmer/obikseq/src/lib.rs","edition":"2021","doc":true,"doctest":true,"test":true},"profile":{"opt_level":"0","debuginfo":2,"debug_assertions":true,"overflow_checks":true,"test":true},"features":[],"filenames":["/Users/coissac/Sync/travail/__MOI__/obikmer/target/debug/deps/libobikseq-5cc47015be91e3b1.rmeta"],"executable":null,"fresh":false}
|
||||
{"reason":"build-finished","success":true}
|
||||
Reference in New Issue
Block a user