Files
obikmer/doc/implementation/superkmer/index.html
T

1172 lines
39 KiB
HTML
Raw Normal View History

2026-04-16 22:38:20 +02:00
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../../theory/indexing/">
<link rel="next" href="../kmer/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>SuperKmer - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#superkmer-implementation" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
SuperKmer
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../kmers/" class="md-nav__link">
2026-04-16 22:38:20 +02:00
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/minimizer/" class="md-nav__link">
<span class="md-ellipsis">
Minimizer selection
</span>
</a>
</li>
2026-04-16 22:38:20 +02:00
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
SuperKmer
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#memory-layout" class="md-nav__link">
<span class="md-ellipsis">
Memory layout
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ascii-encoding-and-decoding" class="md-nav__link">
<span class="md-ellipsis">
ASCII encoding and decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reverse-complement" class="md-nav__link">
<span class="md-ellipsis">
Reverse complement
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#minimizer-sliding-window" class="md-nav__link">
<span class="md-ellipsis">
Minimizer sliding window
</span>
</a>
2026-04-16 22:38:20 +02:00
</li>
<li class="md-nav__item">
<a href="#kmer-extraction" class="md-nav__link">
<span class="md-ellipsis">
Kmer extraction
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../obipipeline/" class="md-nav__link">
<span class="md-ellipsis">
obipipeline library
</span>
</a>
</li>
2026-04-16 22:38:20 +02:00
<li class="md-nav__item">
<a href="../storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../unitig_evidence/" class="md-nav__link">
<span class="md-ellipsis">
Unitig evidence encoding
</span>
</a>
</li>
2026-04-16 22:38:20 +02:00
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#memory-layout" class="md-nav__link">
<span class="md-ellipsis">
Memory layout
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ascii-encoding-and-decoding" class="md-nav__link">
<span class="md-ellipsis">
ASCII encoding and decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reverse-complement" class="md-nav__link">
<span class="md-ellipsis">
Reverse complement
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#minimizer-sliding-window" class="md-nav__link">
<span class="md-ellipsis">
Minimizer sliding window
</span>
</a>
2026-04-16 22:38:20 +02:00
</li>
<li class="md-nav__item">
<a href="#kmer-extraction" class="md-nav__link">
<span class="md-ellipsis">
Kmer extraction
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="superkmer-implementation">SuperKmer — implementation</h1>
<h2 id="memory-layout">Memory layout</h2>
<p>A super-kmer is stored as a <strong>32-bit header</strong> followed by a <strong>byte-aligned nucleotide sequence</strong> (2 bits/base, nucleotide 0 at the MSB of the first byte):</p>
2026-04-16 22:38:20 +02:00
<table>
<thead>
<tr>
<th>Field</th>
<th>Bits</th>
<th>Role</th>
</tr>
</thead>
<tbody>
<tr>
<td>COUNT</td>
<td>24</td>
<td>Occurrence count (≤ 16 M)</td>
</tr>
<tr>
<td>NKMERS</td>
2026-04-16 22:38:20 +02:00
<td>8</td>
<td>Number of kmers (= seq_length k + 1, range 1255)</td>
2026-04-16 22:38:20 +02:00
</tr>
</tbody>
</table>
<p>Bit layout (MSB to LSB): <code>[31:8] COUNT [7:0] NKMERS</code></p>
<p>NKMERS is stored as a raw <code>u8</code> in <strong>kmer units</strong>, not nucleotides. The nucleotide length is recovered as <code>NKMERS + k 1</code>. This avoids the awkward wrapping convention (<code>0 = 256</code>) that would be needed if nucleotide length were stored directly, and gains k1 = 30 units of headroom:</p>
<table>
<thead>
<tr>
<th>unit</th>
<th>u8 covers</th>
<th>max nucleotides</th>
</tr>
</thead>
<tbody>
<tr>
<td>nucleotides</td>
<td>255 nt</td>
<td>225 kmers</td>
</tr>
<tr>
<td><strong>kmers</strong></td>
<td><strong>255 kmers</strong></td>
<td><strong>285 nt</strong></td>
</tr>
</tbody>
</table>
<p>The public accessors:</p>
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">n_kmers</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">seql</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">n_kmers</span><span class="p">()</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">K</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="p">}</span>
2026-04-16 22:38:20 +02:00
<span class="k">fn</span><span class="w"> </span><span class="nf">count</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u32</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">increment</span><span class="p">(</span><span class="o">&amp;</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">add</span><span class="p">(</span><span class="o">&amp;</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">set_count</span><span class="p">(</span><span class="o">&amp;</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">n</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">8</span><span class="p">);</span><span class="w"> </span><span class="p">}</span>
</code></pre></div>
<p>In practice, observed super-kmer lengths on metagenomic data (k=31) are below 55 nucleotides (≤ 25 kmers) — far from the 255-kmer cap. If a super-kmer ever exceeds 255 kmers, it is split with a k1 nucleotide overlap, preserving all kmers without duplication (identical mechanism to partition-boundary splits).</p>
2026-04-16 22:38:20 +02:00
<p>The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.</p>
<h2 id="ascii-encoding-and-decoding">ASCII encoding and decoding</h2>
<p>Two lookup tables handle ASCII ↔ 2-bit conversion:</p>
<ul>
<li><strong><code>ENC: [u8; 32]</code></strong> — indexed by <code>b &amp; 0x1F</code> (lower 5 bits of the ASCII byte). Maps A/a→0, C/c→1, G/g→2, T/t and U/u→3; ambiguous bases and unknowns silently map to 0 (A). 32 entries, fits entirely in L1 cache. Upper- and lowercase are handled identically.</li>
<li><strong><code>DEC4: [u32; 256]</code></strong> — maps a packed byte (4 nucleotides) to 4 ASCII characters packed as a big-endian <code>u32</code>. 1 KB total, fits in L1 cache. One lookup per output byte yields 4 decoded characters.</li>
</ul>
<p>Encoding 4 nucleotides into one byte:</p>
<div class="highlight"><pre><span></span><code><span class="n">byte</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c0</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">6</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c1</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">4</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c2</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c3</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span>
</code></pre></div>
<p>Decoding one byte into 4 ASCII characters:</p>
<div class="highlight"><pre><span></span><code><span class="n">DEC4</span><span class="p">[</span><span class="n">byte</span><span class="p">].</span><span class="n">to_be_bytes</span><span class="p">()</span><span class="w"> </span><span class="c1">// [nuc0, nuc1, nuc2, nuc3] in ASCII</span>
</code></pre></div>
<h2 id="reverse-complement">Reverse complement</h2>
<p>The reverse complement is computed <strong>in place</strong> with zero allocation in two steps.</p>
<p><strong>Step 1 — byte swap with <code>REVCOMP4</code>.</strong> A 256-byte lookup table <code>REVCOMP4</code> maps each byte (4 nucleotides) to its reverse complement. Bytes are swapped from the outside in, applying <code>REVCOMP4</code> to each:</p>
<div class="highlight"><pre><span></span><code><span class="k">const</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">revcomp4</span><span class="p">(</span><span class="n">x</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u8</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">!</span><span class="n">x</span><span class="p">;</span><span class="w"> </span><span class="c1">// complement all bases</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">4</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap nibbles</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x33</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x33</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap 2-bit groups</span>
<span class="w"> </span><span class="n">x</span>
<span class="p">}</span>
</code></pre></div>
<p><code>REVCOMP4</code> is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.</p>
<p><strong>Step 2 — realignment.</strong> After step 1, <code>padding = n × 8 seql × 2</code> spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using <code>BitSlice&lt;u8, Msb0&gt;::rotate_left(padding)</code> from the <code>bitvec</code> crate, which is SIMD-accelerated. The trailing <code>padding</code> bits are then zeroed:</p>
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">n_kmers</span><span class="p">()</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="n">k</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">1</span><span class="p">;</span>
<span class="n">shift</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">seql</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="c1">// number of padding bits</span>
2026-04-16 22:38:20 +02:00
<span class="n">bits</span><span class="p">.</span><span class="n">rotate_left</span><span class="p">(</span><span class="n">shift</span><span class="p">)</span>
<span class="n">bits</span><span class="p">[</span><span class="n">len</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">shift</span><span class="o">..</span><span class="p">].</span><span class="n">fill</span><span class="p">(</span><span class="kc">false</span><span class="p">)</span>
</code></pre></div>
<p><code>Msb0</code> ordering makes the bit layout hardware-independent.</p>
<div class="admonition abstract">
<p class="admonition-title">Algorithm — Super-kmer canonisation</p>
<div class="highlight"><pre><span></span><code>procedure SuperKmerCanonical(seq, SEQL):
for i ← 0 to SEQL 1:
fwd ← nucleotide(seq, i)
rev ← complement(nucleotide(seq, SEQL 1 i))
if fwd &lt; rev: return seq -- forward is canonical
if fwd &gt; rev: return SuperKmerRevcomp(seq, SEQL) -- revcomp is canonical
return seq -- palindrome: either orientation valid
</code></pre></div>
</div>
<h2 id="minimizer-sliding-window">Minimizer sliding window</h2>
<p>Super-kmers are built by <code>SuperKmerIter</code> (crate <code>obiskbuilder</code>), which maintains the current minimizer with a <strong>monotonic deque</strong> over a sliding window of W = k m + 1 m-mer positions.</p>
<p>Each deque entry stores:</p>
<table>
<thead>
<tr>
<th>Field</th>
<th>Type</th>
<th>Purpose</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>position</code></td>
<td>usize</td>
<td>0-based start of this m-mer in the segment</td>
</tr>
<tr>
<td><code>canonical</code></td>
<td>u64</td>
<td>right-aligned canonical m-mer value (lex-min of fwd and rc); used as partition key</td>
</tr>
<tr>
<td><code>hash</code></td>
<td>u64</td>
<td><span class="arithmatex">\(H(\text{canonical})\)</span> — ordering key for random minimizer selection</td>
</tr>
</tbody>
</table>
<p>The hash <span class="arithmatex">\(H\)</span> is the seeded splitmix64 finalizer (see <a href="../../theory/minimizer/">Minimizer selection</a>):</p>
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">hash_mmer</span><span class="p">(</span><span class="n">canonical</span><span class="p">:</span><span class="w"> </span><span class="kt">u64</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u64</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">canonical</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="mh">0x9e3779b97f4a7c15</span><span class="p">;</span><span class="w"> </span><span class="c1">// seed: eliminates fixed point at 0</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">30</span><span class="p">);</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">wrapping_mul</span><span class="p">(</span><span class="mh">0xbf58476d1ce4e5b9</span><span class="p">);</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">27</span><span class="p">);</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">wrapping_mul</span><span class="p">(</span><span class="mh">0x94d049bb133111eb</span><span class="p">);</span>
<span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">^</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">31</span><span class="p">)</span>
<span class="p">}</span>
</code></pre></div>
<p>On each new nucleotide, once the window is full, the deque is updated:</p>
<div class="admonition abstract">
<p class="admonition-title">Algorithm — minimizer deque update</p>
<div class="highlight"><pre><span></span><code>procedure UpdateMinimizer(deque, position, canonical, hash, k, received):
-- pop dominated entries from the back
while deque.back.hash ≥ hash:
deque.pop_back()
deque.push_back({position, canonical, hash})
-- evict expired entries from the front
while deque.front.position + k &lt; received:
deque.pop_front()
</code></pre></div>
</div>
<p>The front of the deque is always the current minimizer. Because the deque is maintained in strictly increasing hash order, each entry is popped at most once — O(1) amortized per nucleotide.</p>
<p>A super-kmer boundary is emitted when the minimizer changes: <code>deque.front.hash ≠ prev_hash</code>. The <code>canonical</code> field of the front entry is <strong>not</strong> used for boundary detection — that uses the hash alone. The canonical value is stored so that the partition key <span class="arithmatex">\(H(\text{canonical})\)</span> can be recomputed independently at routing time from the stored <code>minimizer_pos</code>, without inheriting the minimum-order-statistic bias (see <a href="../../theory/minimizer/#partition-key-independence">Minimizer selection — partition key independence</a>).</p>
2026-04-16 22:38:20 +02:00
<h2 id="kmer-extraction">Kmer extraction</h2>
<p>A k-mer is extracted from a super-kmer with <code>SuperKmer::kmer(i, k)</code>, which returns a <code>Kmer</code> — a left-aligned <code>u64</code> newtype (see <a href="../kmer/">Kmer implementation</a>):</p>
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">kmer</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">i</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nb">Result</span><span class="o">&lt;</span><span class="n">Kmer</span><span class="p">,</span><span class="w"> </span><span class="n">KmerError</span><span class="o">&gt;</span>
</code></pre></div>
<p>The bit slice <code>seq[i*2 .. (i+k)*2]</code> (Msb0 order) is loaded as a big-endian <code>u64</code> via <code>bitvec::load_be</code>, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.</p>
<hr />
<div class="admonition abstract">
<p class="admonition-title">Algorithm — Super-kmer reverse complement</p>
<div class="highlight"><pre><span></span><code>procedure SuperKmerRevcomp(seq, SEQL):
seql ← NKMERS + k 1 -- nucleotide length
n ← ⌈seql / 4⌉ -- number of bytes
shift ← n × 8 seql × 2 -- padding bits to flush
2026-04-16 22:38:20 +02:00
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
lo ← 0 ; hi ← n 1
while lo &lt; hi:
seq[lo], seq[hi] ← REVCOMP4[seq[hi]], REVCOMP4[seq[lo]]
lo ← lo + 1 ; hi ← hi 1
if lo == hi: seq[lo] ← REVCOMP4[seq[lo]]
-- step 2: left-rotate entire bit array by shift, zero trailing bits (SIMD via bitvec)
if shift &gt; 0:
bits.rotate_left(shift)
bits[n×8 shift .. n×8].fill(0)
</code></pre></div>
</div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>