first implementation but far to be optimal
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,963 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../superkmer/">
|
||||
|
||||
|
||||
<link rel="next" href="../chunkreader/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>Kmer - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#kmer-implementation" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmer
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#memory-layout" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Memory layout
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#encoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Encoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#decoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Decoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#reverse-complement" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Reverse complement
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#canonical-form" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Canonical form
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#memory-layout" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Memory layout
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#encoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Encoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#decoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Decoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#reverse-complement" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Reverse complement
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#canonical-form" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Canonical form
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="kmer-implementation">Kmer — implementation</h1>
|
||||
<h2 id="memory-layout">Memory layout</h2>
|
||||
<p><code>Kmer</code> is a <code>#[repr(transparent)]</code> newtype over <code>u64</code>:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="cp">#[repr(transparent)]</span>
|
||||
<span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">Kmer</span><span class="p">(</span><span class="kt">u64</span><span class="p">);</span>
|
||||
</code></pre></div>
|
||||
<p>Nucleotides are packed 2 bits each, <strong>left-aligned</strong>, MSB-first. Nucleotide 0 occupies bits 63–62; nucleotide i occupies bits 63−2i and 62−2i. The low 64−2k bits are always zero. k is <strong>not stored</strong> — it is a parameter of every operation that needs it, and will be owned by the future collection-level indexer.</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>63–62</th>
|
||||
<th>61–60</th>
|
||||
<th>…</th>
|
||||
<th>63−2(k−1)−1 to 63−2(k−1)</th>
|
||||
<th>63−2k down to 0</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>nt 0</td>
|
||||
<td>nt 1</td>
|
||||
<td>…</td>
|
||||
<td>nt k−1</td>
|
||||
<td>zero padding</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<h2 id="encoding">Encoding</h2>
|
||||
<p><code>Kmer::from_ascii(ascii, k)</code> encodes the first k bytes of an ASCII slice using the shared <code>ENC</code> table (see <a href="../superkmer/#ascii-encoding-and-decoding">SuperKmer — ASCII encoding</a>):</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">for</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="mi">0</span><span class="o">..</span><span class="n">k</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="n">val</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">encode_base</span><span class="p">(</span><span class="n">ascii</span><span class="p">[</span><span class="n">i</span><span class="p">])</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">u64</span><span class="p">;</span>
|
||||
<span class="p">}</span>
|
||||
<span class="n">Kmer</span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">))</span>
|
||||
</code></pre></div>
|
||||
<p>Zero allocation — result lives on the stack.</p>
|
||||
<h2 id="decoding">Decoding</h2>
|
||||
<p><code>write_ascii(k, buf)</code> appends k ASCII characters to a caller-supplied <code>Vec<u8></code> using the shared <code>DEC4</code> table: one lookup per 4 nucleotides, two partial-byte lookups for the remainder. No allocation in the hot path.</p>
|
||||
<p><code>to_ascii(k)</code> is a convenience wrapper that allocates and returns a <code>Vec<u8></code>; intended for tests and display only.</p>
|
||||
<h2 id="reverse-complement">Reverse complement</h2>
|
||||
<p>Computed as pure arithmetic — no lookup table, no memory access:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">!</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="c1">// complement</span>
|
||||
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">swap_bytes</span><span class="p">();</span><span class="w"> </span><span class="c1">// reverse bytes</span>
|
||||
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">4</span><span class="p">)</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap nibbles</span>
|
||||
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap 2-bit groups</span>
|
||||
<span class="n">Kmer</span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">))</span>
|
||||
</code></pre></div>
|
||||
<p>After complementing, bytes are reversed (<code>swap_bytes</code>), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.</p>
|
||||
<h2 id="canonical-form">Canonical form</h2>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">canonical</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Self</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">rc</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">revcomp</span><span class="p">(</span><span class="n">k</span><span class="p">);</span>
|
||||
<span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o"><=</span><span class="w"> </span><span class="n">rc</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="o">*</span><span class="bp">self</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">rc</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<p>Lexicographic minimum of forward and reverse-complement, comparing the raw <code>u64</code> values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,947 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../storage/">
|
||||
|
||||
|
||||
<link rel="next" href="../../architecture/sequences/invariant/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>MPHF selection - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#mphf-selection-analysis-in-progress" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
MPHF selection
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#candidates" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Candidates
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#space-at-scale" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Space at scale
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#on-disk-and-mmap-considerations" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
On-disk and mmap considerations
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#open-questions" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Open questions
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#candidates" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Candidates
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#space-at-scale" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Space at scale
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#on-disk-and-mmap-considerations" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
On-disk and mmap considerations
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#open-questions" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Open questions
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="mphf-selection-analysis-in-progress">MPHF selection — analysis in progress</h1>
|
||||
<p>The choice of Minimal Perfect Hash Function for phase 6 is not yet settled. Three candidates were evaluated.</p>
|
||||
<h2 id="candidates">Candidates</h2>
|
||||
<p><strong>boomphf</strong> (BBHash algorithm, maintained by 10X Genomics):</p>
|
||||
<ul>
|
||||
<li>~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem)</li>
|
||||
<li>Parallel construction; well-tested with DNA kmer data at scale</li>
|
||||
<li>Drawback: largest space footprint of the three</li>
|
||||
</ul>
|
||||
<p><strong>ptr_hash</strong> (PtrHash algorithm, Groot Koerkamp, SEA 2025):</p>
|
||||
<ul>
|
||||
<li>~2.4 bits/key; fastest queries (≥2.1× over alternatives, 8–12 ns/key for u64 in tight loops) and fastest construction (≥3.1×)</li>
|
||||
<li>Theoretical foundation solid; paper and Rust crate from the same author</li>
|
||||
<li>Drawback: published February 2025 — very young, no production track record</li>
|
||||
</ul>
|
||||
<p><strong>FMPHGO</strong> (<code>ph</code> crate, Beling, ACM JEA 2023):</p>
|
||||
<ul>
|
||||
<li>~2.1 bits/key — most compact of the three; good query speed; parallelisable construction</li>
|
||||
<li>More established than ptr_hash; actively maintained</li>
|
||||
<li>Currently preferred candidate</li>
|
||||
</ul>
|
||||
<h2 id="space-at-scale">Space at scale</h2>
|
||||
<p>For 1 024 partitions × 100 M kmers/partition:</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>MPHF</th>
|
||||
<th>bits/key</th>
|
||||
<th>Total MPHF size</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>boomphf</td>
|
||||
<td>3.7</td>
|
||||
<td>~47 GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>ptr_hash</td>
|
||||
<td>2.4</td>
|
||||
<td>~31 GB</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>FMPHGO</td>
|
||||
<td>2.1</td>
|
||||
<td>~27 GB</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>In practice, partition sizes depend on the dataset. For a human genome at 30× coverage with p=10 (1 024 partitions), realistic partition sizes are 3–30 M kmers → 1–8 MB per MPHF, well within RAM.</p>
|
||||
<h2 id="on-disk-and-mmap-considerations">On-disk and mmap considerations</h2>
|
||||
<p>All three are in-memory structures. Their internal representation is flat bit arrays (no heap pointers), making them serialisable as contiguous byte blobs and mmappable per partition. True zero-copy access would require rkyv integration; the <code>ph</code> crate currently uses serde, so loading involves a copy. Given per-partition MPHF sizes of 1–8 MB, the OS page cache handles this transparently — strict zero-copy is a refinement, not a blocker.</p>
|
||||
<p>No established Rust crate provides a natively on-disk MPHF. <strong>SSHash</strong> (Sparse and Skew Hash) is a complete kmer dictionary designed for disk access and is order-preserving (overlapping kmers receive consecutive indices → cache-friendly count access), but it is C++-only and covers more than just the MPHF layer.</p>
|
||||
<h2 id="open-questions">Open questions</h2>
|
||||
<ul>
|
||||
<li>Confirm actual partition sizes on representative metagenomic datasets before fixing the choice.</li>
|
||||
<li>Evaluate whether ptr_hash's query speed advantage (2.1–3.3×) justifies adopting a crate that is less than a year old.</li>
|
||||
<li>Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary.</li>
|
||||
<li>Keep SSHash in mind if the indexing architecture is reconsidered at a higher level.</li>
|
||||
</ul>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,946 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../pipeline/">
|
||||
|
||||
|
||||
<link rel="next" href="../mphf/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>On-disk storage - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#on-disk-collection-structure" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
On-disk storage
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#collection-parameters" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Collection parameters
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#count-storage" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Count storage
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#query-protocol" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Query protocol
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#collection-parameters" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Collection parameters
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#count-storage" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Count storage
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#query-protocol" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Query protocol
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="on-disk-collection-structure">On-disk collection structure</h1>
|
||||
<p>Collections are too large to hold in RAM (hundreds of genomes, billions of kmers). The collection lives on disk as a directory of memory-mapped files:</p>
|
||||
<div class="highlight"><pre><span></span><code>collection/
|
||||
metadata.toml — collection parameters (see below)
|
||||
part_XXXX/
|
||||
superkmers.bin.gz — dereplicated super-kmers for this partition (construction artifact)
|
||||
mphf.bin — minimal perfect hash function for this partition
|
||||
counts.bin — packed n-bit count array (or 1-bit presence array)
|
||||
refs.bin — back-references u32 nucleotide offset into unitigs.bin per kmer
|
||||
unitigs.bin — local de Bruijn unitigs (permanent evidence structure)
|
||||
overflow.bin — counts exceeding the packed range (optional)
|
||||
</code></pre></div>
|
||||
<p><code>superkmers.bin.gz</code> is produced during phase 1 and consumed through phases 2–4. It can be deleted after phase 5 — it is not needed for querying. The permanent query structure is <code>mphf.bin + counts.bin + refs.bin + unitigs.bin</code>.</p>
|
||||
<h2 id="collection-parameters">Collection parameters</h2>
|
||||
<p>Stored in <code>metadata.toml</code>:</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Parameter</th>
|
||||
<th>Role</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>k</td>
|
||||
<td>kmer length</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>m</td>
|
||||
<td>minimizer length (odd, < k)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>p</td>
|
||||
<td>partition bits (0 ≤ p ≤ min(14, 2m−16))</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>mode</td>
|
||||
<td><code>presence</code> (1 bit/kmer) or <code>count</code> (n bits/kmer)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>n</td>
|
||||
<td>bits per kmer in count mode (chosen at construction)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>min_count</td>
|
||||
<td>singleton filtering threshold (0 = keep all)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>hash_fn</td>
|
||||
<td>hash function identifier</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>hash_seed</td>
|
||||
<td>seed for the hash function</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<h2 id="count-storage">Count storage</h2>
|
||||
<p><strong>refs.bin capacity:</strong> <code>unitigs.bin</code> is a flat 2-bit-packed nucleotide stream with no separators. Each entry in <code>refs.bin</code> is a u32 nucleotide offset pointing to the first base of the kmer. A u32 covers 4 billion nucleotide positions = 1 GB of sequence per partition. In the worst case (all unitigs of length 1 kmer, offsets spaced k apart), this supports 4 billion / k ≈ 130 million kmers per partition at k=31. In the typical case (long unitigs, consecutive kmers at offset +1), the limit approaches 4 billion kmers — well beyond any realistic partition size.</p>
|
||||
<p><em>Presence mode</em> (coverage ≤ 1x, or when only presence/absence matters):</p>
|
||||
<ul>
|
||||
<li><code>counts.bin</code> is a packed 1-bit array — all bits set to 1 for indexed kmers</li>
|
||||
<li>Singletons are the signal, not filtered</li>
|
||||
</ul>
|
||||
<p><em>Count mode</em> (coverage > 1x):</p>
|
||||
<ul>
|
||||
<li><code>counts.bin</code> is a packed n-bit array; n chosen at construction from the observed distribution</li>
|
||||
<li>Value 0: absent sentinel; values 1..2ⁿ−2: direct count; value 2ⁿ−1: overflow</li>
|
||||
<li>Overflow counts stored in a separate <code>overflow.bin</code> as sorted <code>(index: u32, count: u32)</code> pairs</li>
|
||||
<li>Empirically (k=31, 15x coverage): n=5 covers 97% of real kmers, n=6 covers 99%</li>
|
||||
<li>min_count threshold filters low-frequency kmers (errors) before indexing; for ≤1x, min_count=0</li>
|
||||
</ul>
|
||||
<h2 id="query-protocol">Query protocol</h2>
|
||||
<div class="highlight"><pre><span></span><code>query kmer q
|
||||
→ canonical_minimizer(q) → hash → PART → part_XXXX/
|
||||
→ MPHF(q) → index i
|
||||
→ refs[i] = (unitig_id, kmer_offset)
|
||||
→ read unitig from unitigs.bin → extract kmer at kmer_offset → compare with q
|
||||
→ match : return counts[i]
|
||||
→ no match: kmer absent
|
||||
</code></pre></div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,986 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../../theory/indexing/">
|
||||
|
||||
|
||||
<link rel="next" href="../kmer/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>SuperKmer - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#superkmer-implementation" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
SuperKmer
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../theory/indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#memory-layout" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Memory layout
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#ascii-encoding-and-decoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
ASCII encoding and decoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#reverse-complement" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Reverse complement
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#kmer-extraction" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmer extraction
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#memory-layout" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Memory layout
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#ascii-encoding-and-decoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
ASCII encoding and decoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#reverse-complement" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Reverse complement
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#kmer-extraction" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmer extraction
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="superkmer-implementation">SuperKmer — implementation</h1>
|
||||
<h2 id="memory-layout">Memory layout</h2>
|
||||
<p>A super-kmer is stored as a <strong>32-bit header</strong> followed by a <strong>byte-aligned nucleotide sequence</strong> (2 bits/base, nucleotide 0 at the MSB of the first byte, max 256 nt):</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Field</th>
|
||||
<th>Bits</th>
|
||||
<th>Role</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>COUNT</td>
|
||||
<td>24</td>
|
||||
<td>Occurrence count (≤ 16 M)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>SEQL</td>
|
||||
<td>8</td>
|
||||
<td>Sequence length in nucleotides (1–256)</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>Bit layout (MSB to LSB): <code>[31:8] COUNT [7:0] SEQL</code></p>
|
||||
<p>SEQL is stored as a raw <code>u8</code>: values 1–255 represent lengths 1–255; <strong>0 represents 256</strong> (wrapping convention). The public accessor returns a <code>usize</code> and performs the conversion:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">seql</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="n">s</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="mi">0</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="mi">256</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">s</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">count</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u32</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">increment</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">add</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
|
||||
<span class="k">fn</span><span class="w"> </span><span class="nf">set_count</span><span class="p">(</span><span class="o">&</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">n</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">8</span><span class="p">);</span><span class="w"> </span><span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<p>The SEQL field is 8 bits, capping the stored sequence at 256 nt. Given the expected length of ~40 nt, this cap is almost never reached; when it is, the super-kmer is split at 256 nt with a k−1 overlap, preserving all kmers without duplication.</p>
|
||||
<p>The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.</p>
|
||||
<h2 id="ascii-encoding-and-decoding">ASCII encoding and decoding</h2>
|
||||
<p>Two lookup tables handle ASCII ↔ 2-bit conversion:</p>
|
||||
<ul>
|
||||
<li><strong><code>ENC: [u8; 32]</code></strong> — indexed by <code>b & 0x1F</code> (lower 5 bits of the ASCII byte). Maps A/a→0, C/c→1, G/g→2, T/t and U/u→3; ambiguous bases and unknowns silently map to 0 (A). 32 entries, fits entirely in L1 cache. Upper- and lowercase are handled identically.</li>
|
||||
<li><strong><code>DEC4: [u32; 256]</code></strong> — maps a packed byte (4 nucleotides) to 4 ASCII characters packed as a big-endian <code>u32</code>. 1 KB total, fits in L1 cache. One lookup per output byte yields 4 decoded characters.</li>
|
||||
</ul>
|
||||
<p>Encoding 4 nucleotides into one byte:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="n">byte</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c0</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">6</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c1</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">4</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c2</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c3</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span>
|
||||
</code></pre></div>
|
||||
<p>Decoding one byte into 4 ASCII characters:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="n">DEC4</span><span class="p">[</span><span class="n">byte</span><span class="p">].</span><span class="n">to_be_bytes</span><span class="p">()</span><span class="w"> </span><span class="c1">// [nuc0, nuc1, nuc2, nuc3] in ASCII</span>
|
||||
</code></pre></div>
|
||||
<h2 id="reverse-complement">Reverse complement</h2>
|
||||
<p>The reverse complement is computed <strong>in place</strong> with zero allocation in two steps.</p>
|
||||
<p><strong>Step 1 — byte swap with <code>REVCOMP4</code>.</strong> A 256-byte lookup table <code>REVCOMP4</code> maps each byte (4 nucleotides) to its reverse complement. Bytes are swapped from the outside in, applying <code>REVCOMP4</code> to each:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">const</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">revcomp4</span><span class="p">(</span><span class="n">x</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u8</span><span class="w"> </span><span class="p">{</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">!</span><span class="n">x</span><span class="p">;</span><span class="w"> </span><span class="c1">// complement all bases</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">4</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap nibbles</span>
|
||||
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">>></span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x33</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="mh">0x33</span><span class="p">)</span><span class="w"> </span><span class="o"><<</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap 2-bit groups</span>
|
||||
<span class="w"> </span><span class="n">x</span>
|
||||
<span class="p">}</span>
|
||||
</code></pre></div>
|
||||
<p><code>REVCOMP4</code> is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.</p>
|
||||
<p><strong>Step 2 — realignment.</strong> After step 1, <code>padding = n × 8 − SEQL × 2</code> spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using <code>BitSlice<u8, Msb0>::rotate_left(padding)</code> from the <code>bitvec</code> crate, which is SIMD-accelerated. The trailing <code>padding</code> bits are then zeroed:</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="n">shift</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">SEQL</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="c1">// number of padding bits</span>
|
||||
<span class="n">bits</span><span class="p">.</span><span class="n">rotate_left</span><span class="p">(</span><span class="n">shift</span><span class="p">)</span>
|
||||
<span class="n">bits</span><span class="p">[</span><span class="n">len</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">shift</span><span class="o">..</span><span class="p">].</span><span class="n">fill</span><span class="p">(</span><span class="kc">false</span><span class="p">)</span>
|
||||
</code></pre></div>
|
||||
<p><code>Msb0</code> ordering makes the bit layout hardware-independent.</p>
|
||||
<div class="admonition abstract">
|
||||
<p class="admonition-title">Algorithm — Super-kmer canonisation</p>
|
||||
<div class="highlight"><pre><span></span><code>procedure SuperKmerCanonical(seq, SEQL):
|
||||
for i ← 0 to SEQL − 1:
|
||||
fwd ← nucleotide(seq, i)
|
||||
rev ← complement(nucleotide(seq, SEQL − 1 − i))
|
||||
if fwd < rev: return seq -- forward is canonical
|
||||
if fwd > rev: return SuperKmerRevcomp(seq, SEQL) -- revcomp is canonical
|
||||
return seq -- palindrome: either orientation valid
|
||||
</code></pre></div>
|
||||
</div>
|
||||
<h2 id="kmer-extraction">Kmer extraction</h2>
|
||||
<p>A k-mer is extracted from a super-kmer with <code>SuperKmer::kmer(i, k)</code>, which returns a <code>Kmer</code> — a left-aligned <code>u64</code> newtype (see <a href="../kmer/">Kmer implementation</a>):</p>
|
||||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">kmer</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">i</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Result</span><span class="o"><</span><span class="n">Kmer</span><span class="p">,</span><span class="w"> </span><span class="n">KmerError</span><span class="o">></span>
|
||||
</code></pre></div>
|
||||
<p>The bit slice <code>seq[i*2 .. (i+k)*2]</code> (Msb0 order) is loaded as a big-endian <code>u64</code> via <code>bitvec::load_be</code>, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.</p>
|
||||
<hr />
|
||||
<div class="admonition abstract">
|
||||
<p class="admonition-title">Algorithm — Super-kmer reverse complement</p>
|
||||
<div class="highlight"><pre><span></span><code>procedure SuperKmerRevcomp(seq, SEQL):
|
||||
n ← ⌈SEQL / 4⌉ -- number of bytes
|
||||
shift ← n × 8 − SEQL × 2 -- padding bits to flush
|
||||
|
||||
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
|
||||
lo ← 0 ; hi ← n − 1
|
||||
while lo < hi:
|
||||
seq[lo], seq[hi] ← REVCOMP4[seq[hi]], REVCOMP4[seq[lo]]
|
||||
lo ← lo + 1 ; hi ← hi − 1
|
||||
if lo == hi: seq[lo] ← REVCOMP4[seq[lo]]
|
||||
|
||||
-- step 2: left-rotate entire bit array by shift, zero trailing bits (SIMD via bitvec)
|
||||
if shift > 0:
|
||||
bits.rotate_left(shift)
|
||||
bits[n×8 − shift .. n×8].fill(0)
|
||||
</code></pre></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user