first implementation but far to be optimal
This commit is contained in:
@@ -0,0 +1,890 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../kmers/">
|
||||
|
||||
|
||||
<link rel="next" href="../entropy/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>DNA encoding - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#dna-encoding" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
DNA encoding
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#2-bit-nucleotide-encoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
2-bit nucleotide encoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#kmer-encoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmer encoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#2-bit-nucleotide-encoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
2-bit nucleotide encoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#kmer-encoding" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmer encoding
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="dna-encoding">DNA encoding</h1>
|
||||
<h2 id="2-bit-nucleotide-encoding">2-bit nucleotide encoding</h2>
|
||||
<p>All nucleotides are encoded on 2 bits, MSB-first within each word. Nucleotides are numbered 0-based from the 5′ end across all sequence types:</p>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Base</th>
|
||||
<th>Encoding</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>A</td>
|
||||
<td><code>00</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>C</td>
|
||||
<td><code>01</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>G</td>
|
||||
<td><code>10</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>T</td>
|
||||
<td><code>11</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>The Watson-Crick complement of any base is its bitwise NOT on 2 bits: <code>complement(base) = ~base & 0b11</code>.</p>
|
||||
<h2 id="kmer-encoding">Kmer encoding</h2>
|
||||
<p>A kmer fits in a single <code>u64</code>. Nucleotide 0 occupies bits 63–62, nucleotide i occupies bits 63−2i and 62−2i, and the low 64−2k bits are zero. Extraction of nucleotide i (0 ≤ i < k): <code>(kmer >> (62 - 2*i)) & 0b11</code>.</p>
|
||||
<p>Reverse complement is computed via a <strong>16-bit lookup table</strong> (65 536 entries × 2 bytes = 128 KB, fits in L2 cache) storing the reverse-complement of every 8-base chunk.</p>
|
||||
<div class="admonition abstract">
|
||||
<p class="admonition-title">Algorithm — Kmer reverse complement</p>
|
||||
<div class="highlight"><pre><span></span><code>procedure KmerRevcomp(kmer, k):
|
||||
raw ← TABLE16[kmer & 0xFFFF] << 48
|
||||
| TABLE16[(kmer >> 16) & 0xFFFF] << 32
|
||||
| TABLE16[(kmer >> 32) & 0xFFFF] << 16
|
||||
| TABLE16[(kmer >> 48) & 0xFFFF]
|
||||
return raw << (64 - 2*k)
|
||||
</code></pre></div>
|
||||
</div>
|
||||
<p>The <strong>canonical form</strong> is the lexicographic minimum of the kmer and its reverse complement:</p>
|
||||
<div class="highlight"><pre><span></span><code>canonical(kmer) = min(kmer, revcomp(kmer))
|
||||
</code></pre></div>
|
||||
<p>This halves the kmer space and ensures strand-independent counting.</p>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,990 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../encoding/">
|
||||
|
||||
|
||||
<link rel="next" href="../indexing/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>Entropy filter - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#kmer-entropy-filter" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Entropy filter
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#sub-word-frequencies" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Sub-word frequencies
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#corrected-shannon-entropy" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Corrected Shannon entropy
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#maximum-entropy-correction-for-small-samples" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Maximum entropy correction for small samples
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#normalized-entropy" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Normalized entropy
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#final-score" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Final score
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#interpretation-as-an-effective-number-of-classes" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Interpretation as an effective number of classes
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#properties" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Properties
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#sub-word-frequencies" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Sub-word frequencies
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#corrected-shannon-entropy" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Corrected Shannon entropy
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#maximum-entropy-correction-for-small-samples" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Maximum entropy correction for small samples
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#normalized-entropy" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Normalized entropy
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#final-score" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Final score
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#interpretation-as-an-effective-number-of-classes" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Interpretation as an effective number of classes
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#properties" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Properties
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="kmer-entropy-filter">Kmer entropy filter</h1>
|
||||
<p>Low-complexity kmers (polyA, polyT, tandem repeats) are detected and excluded during phase 1. The filter computes a <strong>normalized Shannon entropy</strong> over sub-words of multiple sizes, corrected for two sources of bias: the small number of observations within a single kmer, and the unequal sizes of circular equivalence classes.</p>
|
||||
<h2 id="sub-word-frequencies">Sub-word frequencies</h2>
|
||||
<p>For a kmer of length k and a sub-word size ws (1 ≤ ws ≤ ws_max, typically ws_max = 6), extract the <span class="arithmatex">\(n_{\text{words}} = k - ws + 1\)</span> overlapping sub-words by sliding a window of length ws:</p>
|
||||
<div class="arithmatex">\[w_i = \text{kmer}[i \mathinner{..} i+ws-1], \quad i = 0, \ldots, n_{\text{words}}-1\]</div>
|
||||
<p>Each sub-word is mapped to its <strong>circular canonical form</strong>: the lexicographic minimum among all cyclic rotations of the word <strong>and all cyclic rotations of its reverse complement</strong>. This extended equivalence relation ensures that entropy(K) = entropy(revcomp(K)) — the filter is strand-symmetric. Let <span class="arithmatex">\(s_j\)</span> be the size of equivalence class <span class="arithmatex">\(j\)</span> (number of distinct raw words mapping to canonical form <span class="arithmatex">\(j\)</span>), and <span class="arithmatex">\(f_j\)</span> the count of canonical form <span class="arithmatex">\(j\)</span> among the <span class="arithmatex">\(n_{\text{words}}\)</span> sub-words (<span class="arithmatex">\(\sum_j f_j = n_{\text{words}}\)</span>).</p>
|
||||
<h2 id="corrected-shannon-entropy">Corrected Shannon entropy</h2>
|
||||
<p>The circular equivalence classes have unequal sizes: under a uniform distribution over all <span class="arithmatex">\(4^{ws}\)</span> raw words, class <span class="arithmatex">\(j\)</span> is visited with probability <span class="arithmatex">\(s_j / 4^{ws}\)</span>, not <span class="arithmatex">\(1/n_a\)</span>. Computing entropy directly over canonical classes therefore underestimates the entropy of a random sequence.</p>
|
||||
<p>The correction "unfolds" each canonical class back to its member raw words, redistributing each observation of class <span class="arithmatex">\(j\)</span> equally among its <span class="arithmatex">\(s_j\)</span> members:</p>
|
||||
<div class="arithmatex">\[H_{\text{corr}} = \log(n_{\text{words}}) - \frac{1}{n_{\text{words}}} \sum_j f_j \log f_j + \frac{1}{n_{\text{words}}} \sum_j f_j \log s_j\]</div>
|
||||
<p>The last term is the correction for unequal class sizes. For a uniformly random sequence (<span class="arithmatex">\(f_j \approx n_{\text{words}} \cdot s_j / 4^{ws}\)</span>), this gives <span class="arithmatex">\(H_{\text{corr}} \approx \log(4^{ws}) = 2 \cdot ws \cdot \log 2\)</span>, the maximum entropy over raw words.</p>
|
||||
<h2 id="maximum-entropy-correction-for-small-samples">Maximum entropy correction for small samples</h2>
|
||||
<p>With only <span class="arithmatex">\(n_{\text{words}}\)</span> observations over <span class="arithmatex">\(4^{ws}\)</span> possible raw words, the achievable maximum entropy is bounded by the most uniform integer distribution over <span class="arithmatex">\(4^{ws}\)</span> categories.</p>
|
||||
<p>Let <span class="arithmatex">\(c = \lfloor n_{\text{words}} / 4^{ws} \rfloor\)</span> and <span class="arithmatex">\(r = n_{\text{words}} \bmod 4^{ws}\)</span>. The most uniform integer distribution assigns frequency <span class="arithmatex">\(c+1\)</span> to <span class="arithmatex">\(r\)</span> categories and <span class="arithmatex">\(c\)</span> to the remaining <span class="arithmatex">\(4^{ws} - r\)</span>, with the convention <span class="arithmatex">\(0 \log 0 = 0\)</span>:</p>
|
||||
<div class="arithmatex">\[H_{\max} = -\left[(4^{ws} - r)\,\frac{c}{n_{\text{words}}}\log\frac{c}{n_{\text{words}}} + r\,\frac{c+1}{n_{\text{words}}}\log\frac{c+1}{n_{\text{words}}}\right]\]</div>
|
||||
<p>When <span class="arithmatex">\(n_{\text{words}} < 4^{ws}\)</span>: <span class="arithmatex">\(c=0\)</span>, <span class="arithmatex">\(r=n_{\text{words}}\)</span>, and the formula reduces to <span class="arithmatex">\(H_{\max} = \log(n_{\text{words}})\)</span> — a single unified expression covers both regimes. A truly random sequence achieves <span class="arithmatex">\(H_{\text{corr}} \approx H_{\max}\)</span>.</p>
|
||||
<h2 id="normalized-entropy">Normalized entropy</h2>
|
||||
<div class="arithmatex">\[\hat{H}(ws) = \frac{H_{\text{corr}}}{H_{\max}} \in [0, 1]\]</div>
|
||||
<h2 id="final-score">Final score</h2>
|
||||
<p>The filter computes <span class="arithmatex">\(\hat{H}(ws)\)</span> for each word size ws from 1 to ws_max and returns the <strong>minimum</strong>:</p>
|
||||
<div class="arithmatex">\[\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)\]</div>
|
||||
<p>A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if <span class="arithmatex">\(\text{entropy}(kmer) \leq \theta\)</span>, where <span class="arithmatex">\(\theta\)</span> is a collection parameter. The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.</p>
|
||||
<h2 id="interpretation-as-an-effective-number-of-classes">Interpretation as an effective number of classes</h2>
|
||||
<p><span class="arithmatex">\(H_{\text{corr}}\)</span> is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: <span class="arithmatex">\(N_{\text{eff}} = e^{H_{\text{corr}}}\)</span> is the number of equiprobable classes that would yield the same entropy.</p>
|
||||
<p>For the normalised score <span class="arithmatex">\(\hat{H}\)</span>, dividing by <span class="arithmatex">\(H_{\text{max}}\)</span> changes the logarithm base:</p>
|
||||
<div class="arithmatex">\[\hat{H} = \frac{\log N_{\text{eff}}}{\log N_{\text{max}}} = \log_{N_{\text{max}}} N_{\text{eff}} \quad \Longleftrightarrow \quad N_{\text{eff}} = N_{\text{max}}^{\,\hat{H}}\]</div>
|
||||
<p>The property is preserved: <span class="arithmatex">\(\hat{H}\)</span> is the logarithm (in base <span class="arithmatex">\(N_{\text{max}}\)</span>) of the effective number of equi-represented classes.</p>
|
||||
<p>In the large-sample limit (<span class="arithmatex">\(n_{\text{words}} \gg 4^{ws}\)</span>), <span class="arithmatex">\(N_{\text{max}} \approx 4^{ws}\)</span>, giving:</p>
|
||||
<div class="arithmatex">\[N_{\text{eff}} \approx 4^{ws \cdot \hat{H}}\]</div>
|
||||
<p>This has a clean interpretation: <span class="arithmatex">\(ws \cdot \hat{H}\)</span> is the <strong>effective word length</strong> (in bases) of a perfectly uniform distribution that would produce the same entropy. At <span class="arithmatex">\(\hat{H} = 1\)</span> the full space of <span class="arithmatex">\(4^{ws}\)</span> words is used; at <span class="arithmatex">\(\hat{H} = 0.5\)</span> with ws=2, only <span class="arithmatex">\(4^1 = 4\)</span> effective classes out of 16 are occupied.</p>
|
||||
<p>In our actual regime, <span class="arithmatex">\(n_{\text{words}}\)</span> is small and <span class="arithmatex">\(4^{ws}\)</span> can exceed <span class="arithmatex">\(n_{\text{words}}\)</span>, so <span class="arithmatex">\(H_{\text{max}} < \log(4^{ws})\)</span> due to the small-sample correction. The exact effective count is <span class="arithmatex">\(N_{\text{max}}^{\hat{H}}\)</span>, not <span class="arithmatex">\(4^{ws \cdot \hat{H}}\)</span>.</p>
|
||||
<h2 id="properties">Properties</h2>
|
||||
<p>The entropy score is a function of the kmer sequence alone — it does not depend on the surrounding context or on the position within any genome. Two consequences:</p>
|
||||
<ul>
|
||||
<li><strong>Orientation invariance</strong>: <span class="arithmatex">\(\text{entropy}(K) = \text{entropy}(\text{revcomp}(K))\)</span>, guaranteed by the strand-symmetric canonical form.</li>
|
||||
<li><strong>Context independence</strong>: the same kmer is always rejected or always kept, regardless of which genome it occurs in, where in that genome it appears, or which strand is considered. The filter defines a fixed partition of the kmer space into low-complexity and valid kmers.</li>
|
||||
</ul>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,910 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../entropy/">
|
||||
|
||||
|
||||
<link rel="next" href="../../implementation/superkmer/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>Partitioning architecture - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#partitioning-and-indexing-architecture" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../kmers/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#why-hashing-is-necessary" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Why hashing is necessary
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#parameter-choices" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Parameter choices
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#why-hashing-is-necessary" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Why hashing is necessary
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#parameter-choices" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Parameter choices
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="partitioning-and-indexing-architecture">Partitioning and indexing architecture</h1>
|
||||
<p>The canonical minimizer of a super-kmer is hashed to produce a <strong>p-bit routing value</strong> (p is a collection-level parameter):</p>
|
||||
<div class="highlight"><pre><span></span><code>canonical minimizer → hash(minimizer) → p-bit value → PART → partition directory
|
||||
</code></pre></div>
|
||||
<p>PART is computed once at phase 1 to open the correct partition file, then discarded. It is recomputed on the fly at query time. It is never stored in the super-kmer header.</p>
|
||||
<p>Each partition holds one MPHF instance (phase 6) that indexes kmers as plain u64 values — the minimizer plays no role inside the partition.</p>
|
||||
<h2 id="why-hashing-is-necessary">Why hashing is necessary</h2>
|
||||
<p>The canonical minimizer is an m-mer (m ∈ {9, 11, 13, 15}), encoded in 2m bits (18 to 30 bits). Its distribution over the <span class="arithmatex">\(4^m\)</span> possible values is <strong>not uniform</strong>: because the minimizer is the lexicographic minimum of a window of m-mers, small values are systematically over-represented (Golan & Shur 2025; Kille <em>et al.</em> 2023; Pan & Reinert 2024; Zheng <em>et al.</em> 2020, 2021)<sup id="fnref:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">1</a></sup> <sup id="fnref:Zheng2021-cc"><a class="footnote-ref" href="#fn:Zheng2021-cc">2</a></sup> <sup id="fnref:Pan2024-hb"><a class="footnote-ref" href="#fn:Pan2024-hb">3</a></sup> <sup id="fnref:Kille2023-px"><a class="footnote-ref" href="#fn:Kille2023-px">4</a></sup> <sup id="fnref:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">5</a></sup>. Routing directly by the raw minimizer value would produce severely unbalanced partitions.</p>
|
||||
<p>A hash function with good avalanche properties redistributes this skewed distribution uniformly over the <span class="arithmatex">\(2^p\)</span> partition slots. The key reason this works well is the <strong>entropy gap</strong>: p is chosen to be much smaller than 2m, so the hash compresses many distinct minimizer values into each partition slot. Even under strong bias in the minimizer distribution, as long as its effective entropy exceeds p bits — which holds comfortably since the set of distinct minimizers in any real dataset is far larger than <span class="arithmatex">\(2^p\)</span> — the load imbalance across partitions is negligible.</p>
|
||||
<h2 id="parameter-choices">Parameter choices</h2>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>m</th>
|
||||
<th>2m (bits)</th>
|
||||
<th>Typical p</th>
|
||||
<th>Partitions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>9</td>
|
||||
<td>18</td>
|
||||
<td>6–8</td>
|
||||
<td>64–256</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>11</td>
|
||||
<td>22</td>
|
||||
<td>8–10</td>
|
||||
<td>256–1 024</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>13</td>
|
||||
<td>26</td>
|
||||
<td>10–12</td>
|
||||
<td>1 024–4 096</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>15</td>
|
||||
<td>30</td>
|
||||
<td>10–14</td>
|
||||
<td>1 024–16 384</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<p>The hard constraint is p ≤ 2m: one cannot extract more bits of uniform randomness from a source than it contains. In practice p is chosen well below 2m, leaving a large entropy margin that absorbs the minimizer bias. For k=31, m=13, p=10: 1 024 partitions with comfortable balance.</p>
|
||||
<div class="footnote">
|
||||
<hr />
|
||||
<ol>
|
||||
<li id="fn:Zheng2020-ji">
|
||||
<p>Zheng, H., Kingsford, C. & Marçais, G. (2020). <a href="https://doi.org/10.1093/bioinformatics/btaa472">Improved design and analysis of practical minimizers</a>. <em>Bioinformatics (Oxford, England)</em>, 36, i119--i127. <a class="footnote-backref" href="#fnref:Zheng2020-ji" title="Jump back to footnote 1 in the text">↩</a></p>
|
||||
</li>
|
||||
<li id="fn:Zheng2021-cc">
|
||||
<p>Zheng, H., Kingsford, C. & Marçais, G. (2021). <a href="https://doi.org/10.1093/bioinformatics/btab313">Sequence-specific minimizers via polar sets</a>. <em>Bioinformatics (Oxford, England)</em>, 37, i187--i195. <a class="footnote-backref" href="#fnref:Zheng2021-cc" title="Jump back to footnote 2 in the text">↩</a></p>
|
||||
</li>
|
||||
<li id="fn:Pan2024-hb">
|
||||
<p>Pan, C. & Reinert, K. (2024). <a href="https://doi.org/10.1093/bioinformatics/btae045">A simple refined DNA minimizer operator enables 2-fold faster computation</a>. <em>Bioinformatics (Oxford, England)</em>, 40. <a class="footnote-backref" href="#fnref:Pan2024-hb" title="Jump back to footnote 3 in the text">↩</a></p>
|
||||
</li>
|
||||
<li id="fn:Kille2023-px">
|
||||
<p>Kille, B., Garrison, E., Treangen, T.J. & Phillippy, A.M. (2023). <a href="https://doi.org/10.1093/bioinformatics/btad512">Minmers are a generalization of minimizers that enable unbiased local jaccard estimation</a>. <em>Bioinformatics (Oxford, England)</em>, 39. <a class="footnote-backref" href="#fnref:Kille2023-px" title="Jump back to footnote 4 in the text">↩</a></p>
|
||||
</li>
|
||||
<li id="fn:Golan2025-xf">
|
||||
<p>Golan, S. & Shur, A.M. (2025). <a href="https://doi.org/10.1007/978-3-031-82670-2\_25">Expected density of random minimizers</a>. In: <em>Lecture notes in computer science</em>, Lecture notes in computer science. Springer Nature Switzerland, Cham, pp. 347--360. <a class="footnote-backref" href="#fnref:Golan2025-xf" title="Jump back to footnote 5 in the text">↩</a></p>
|
||||
</li>
|
||||
</ol>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,931 @@
|
||||
|
||||
<!doctype html>
|
||||
<html lang="en" class="no-js">
|
||||
<head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="prev" href="../..">
|
||||
|
||||
|
||||
<link rel="next" href="../encoding/">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="icon" href="../../assets/images/favicon.png">
|
||||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||||
|
||||
|
||||
|
||||
<title>Kmers and super-kmers - obikmer</title>
|
||||
|
||||
|
||||
|
||||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||||
|
||||
|
||||
|
||||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</head>
|
||||
|
||||
|
||||
<body dir="ltr">
|
||||
|
||||
|
||||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||||
<label class="md-overlay" for="__drawer"></label>
|
||||
<div data-md-component="skip">
|
||||
|
||||
|
||||
<a href="#kmers-and-super-kmers" class="md-skip">
|
||||
Skip to content
|
||||
</a>
|
||||
|
||||
</div>
|
||||
<div data-md-component="announce">
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<header class="md-header md-header--shadow" data-md-component="header">
|
||||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
<label class="md-header__button md-icon" for="__drawer">
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||||
</label>
|
||||
<div class="md-header__title" data-md-component="header-title">
|
||||
<div class="md-header__ellipsis">
|
||||
<div class="md-header__topic">
|
||||
<span class="md-ellipsis">
|
||||
obikmer
|
||||
</span>
|
||||
</div>
|
||||
<div class="md-header__topic" data-md-component="header-topic">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||||
|
||||
|
||||
|
||||
|
||||
</nav>
|
||||
|
||||
</header>
|
||||
|
||||
<div class="md-container" data-md-component="container">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<main class="md-main" data-md-component="main">
|
||||
<div class="md-main__inner md-grid">
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||||
<label class="md-nav__title" for="__drawer">
|
||||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||||
|
||||
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||||
|
||||
</a>
|
||||
obikmer
|
||||
</label>
|
||||
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../.." class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Home
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
|
||||
<label class="md-nav__title" for="__nav_2">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Theory
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--active">
|
||||
|
||||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<a href="./" class="md-nav__link md-nav__link--active">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmers and super-kmers
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#kmers" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmers
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#super-kmers" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Super-kmers
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Super-kmers">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#canonical-super-kmers" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Canonical super-kmers
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#expected-length-of-a-super-kmer" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Expected length of a super-kmer
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../encoding/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
DNA encoding
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../entropy/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Entropy filter
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../indexing/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Partitioning architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_3">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Implementation
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/superkmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
SuperKmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/kmer/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Kmer
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/chunkreader/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Chunk reader
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/pipeline/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Construction pipeline
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/storage/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
On-disk storage
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../implementation/mphf/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
MPHF selection
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item md-nav__item--nested">
|
||||
|
||||
|
||||
|
||||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||||
|
||||
|
||||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
</label>
|
||||
|
||||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||||
<label class="md-nav__title" for="__nav_4">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
|
||||
|
||||
Architecture
|
||||
|
||||
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-scrollfix>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||||
|
||||
|
||||
|
||||
<span class="md-ellipsis">
|
||||
|
||||
|
||||
Sequences
|
||||
|
||||
|
||||
|
||||
</span>
|
||||
|
||||
|
||||
|
||||
</a>
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||||
<div class="md-sidebar__scrollwrap">
|
||||
<div class="md-sidebar__inner">
|
||||
|
||||
|
||||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<label class="md-nav__title" for="__toc">
|
||||
<span class="md-nav__icon md-icon"></span>
|
||||
Table of contents
|
||||
</label>
|
||||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#kmers" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Kmers
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#super-kmers" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Super-kmers
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
<nav class="md-nav" aria-label="Super-kmers">
|
||||
<ul class="md-nav__list">
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#canonical-super-kmers" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Canonical super-kmers
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
<li class="md-nav__item">
|
||||
<a href="#expected-length-of-a-super-kmer" class="md-nav__link">
|
||||
<span class="md-ellipsis">
|
||||
|
||||
Expected length of a super-kmer
|
||||
|
||||
</span>
|
||||
</a>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
</nav>
|
||||
|
||||
</li>
|
||||
|
||||
</ul>
|
||||
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="md-content" data-md-component="content">
|
||||
|
||||
<article class="md-content__inner md-typeset">
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<h1 id="kmers-and-super-kmers">Kmers and super-kmers</h1>
|
||||
<h2 id="kmers">Kmers</h2>
|
||||
<p>A <strong>kmer</strong> is a DNA subsequence of fixed length k. Two constraints govern the choice of k:</p>
|
||||
<ul>
|
||||
<li><strong>k ∈ [11, 31]</strong>: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.</li>
|
||||
<li><strong>k is odd</strong>: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form <code>min(kmer, revcomp(kmer))</code> is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.</li>
|
||||
</ul>
|
||||
<h2 id="super-kmers">Super-kmers</h2>
|
||||
<p>A <strong>super-kmer</strong> is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k−1 nucleotides. Each kmer of the run carries the same <strong>canonical minimizer</strong>. The <strong>canonical minimizer</strong> of a kmer is the smallest value of <code>min(m-mer, revcomp(m-mer))</code> over all m-mers within the kmer (m < k, m odd).</p>
|
||||
<h3 id="canonical-super-kmers">Canonical super-kmers</h3>
|
||||
<p>A <strong>canonical super-kmer</strong> is the lexicographic minimum of a super-kmer and its reverse complement:</p>
|
||||
<div class="highlight"><pre><span></span><code>canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
|
||||
</code></pre></div>
|
||||
<p>When a read and its reverse-complement are both sequenced, they produce super-kmers that are reverse complements of each other. Both map to the same canonical form: the same genomic region is represented by a single canonical super-kmer regardless of which strand was read.</p>
|
||||
<h3 id="expected-length-of-a-super-kmer">Expected length of a super-kmer</h3>
|
||||
<p>For a random minimizer of length m over k-mers of length k, the density of minimizer positions is approximately 2/(k−m+2) (Golan & Shur 2025; Zheng <em>et al.</em> 2020)<sup id="fnref:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">2</a></sup> <sup id="fnref:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">3</a></sup>, so the expected number of consecutive k-mers per super-kmer is (k−m+2)/2. A run of n k-mers spans n + k − 1 nucleotides, giving:</p>
|
||||
<div class="arithmatex">\[L_{\text{nt}} = \frac{k-m+2}{2} + k - 1\]</div>
|
||||
<p>For k=31, m=13: expected ≈ 40 nt. In practice super-kmers rarely exceed a few dozen nucleotides.<sup id="fnref:superkmer_length"><a class="footnote-ref" href="#fn:superkmer_length">1</a></sup></p>
|
||||
<div class="footnote">
|
||||
<hr />
|
||||
<ol>
|
||||
<li id="fn:superkmer_length">
|
||||
<p>The expected length formula and the density approximation 2/(k−m+2) should be verified against the values reported in (Zheng <em>et al.</em> 2020)<sup id="fnref2:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">2</a></sup> and (Golan & Shur 2025)<sup id="fnref2:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">3</a></sup>. <a class="footnote-backref" href="#fnref:superkmer_length" title="Jump back to footnote 1 in the text">↩</a></p>
|
||||
</li>
|
||||
<li id="fn:Zheng2020-ji">
|
||||
<p>Zheng, H., Kingsford, C. & Marçais, G. (2020). <a href="https://doi.org/10.1093/bioinformatics/btaa472">Improved design and analysis of practical minimizers</a>. <em>Bioinformatics (Oxford, England)</em>, 36, i119--i127. <a class="footnote-backref" href="#fnref:Zheng2020-ji" title="Jump back to footnote 2 in the text">↩</a><a class="footnote-backref" href="#fnref2:Zheng2020-ji" title="Jump back to footnote 2 in the text">↩</a></p>
|
||||
</li>
|
||||
<li id="fn:Golan2025-xf">
|
||||
<p>Golan, S. & Shur, A.M. (2025). <a href="https://doi.org/10.1007/978-3-031-82670-2\_25">Expected density of random minimizers</a>. In: <em>Lecture notes in computer science</em>, Lecture notes in computer science. Springer Nature Switzerland, Cham, pp. 347--360. <a class="footnote-backref" href="#fnref:Golan2025-xf" title="Jump back to footnote 3 in the text">↩</a><a class="footnote-backref" href="#fnref2:Golan2025-xf" title="Jump back to footnote 3 in the text">↩</a></p>
|
||||
</li>
|
||||
</ol>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
</article>
|
||||
</div>
|
||||
|
||||
|
||||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
<footer class="md-footer">
|
||||
|
||||
<div class="md-footer-meta md-typeset">
|
||||
<div class="md-footer-meta__inner md-grid">
|
||||
<div class="md-copyright">
|
||||
|
||||
|
||||
Made with
|
||||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||||
Material for MkDocs
|
||||
</a>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
|
||||
</div>
|
||||
<div class="md-dialog" data-md-component="dialog">
|
||||
<div class="md-dialog__inner md-typeset"></div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||||
|
||||
|
||||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||||
|
||||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||||
|
||||
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user