Files
obikmer/doc/theory/kmers/index.html
T
2026-04-19 12:17:16 +02:00

931 lines
20 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../..">
<link rel="next" href="../encoding/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Kmers and super-kmers - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#kmers-and-super-kmers" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#kmers" class="md-nav__link">
<span class="md-ellipsis">
Kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Super-kmers
</span>
</a>
<nav class="md-nav" aria-label="Super-kmers">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#canonical-super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Canonical super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#expected-length-of-a-super-kmer" class="md-nav__link">
<span class="md-ellipsis">
Expected length of a super-kmer
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#kmers" class="md-nav__link">
<span class="md-ellipsis">
Kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Super-kmers
</span>
</a>
<nav class="md-nav" aria-label="Super-kmers">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#canonical-super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Canonical super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#expected-length-of-a-super-kmer" class="md-nav__link">
<span class="md-ellipsis">
Expected length of a super-kmer
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="kmers-and-super-kmers">Kmers and super-kmers</h1>
<h2 id="kmers">Kmers</h2>
<p>A <strong>kmer</strong> is a DNA subsequence of fixed length k. Two constraints govern the choice of k:</p>
<ul>
<li><strong>k ∈ [11, 31]</strong>: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.</li>
<li><strong>k is odd</strong>: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form <code>min(kmer, revcomp(kmer))</code> is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.</li>
</ul>
<h2 id="super-kmers">Super-kmers</h2>
<p>A <strong>super-kmer</strong> is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k1 nucleotides. Each kmer of the run carries the same <strong>canonical minimizer</strong>. The <strong>canonical minimizer</strong> of a kmer is the smallest value of <code>min(m-mer, revcomp(m-mer))</code> over all m-mers within the kmer (m &lt; k, m odd).</p>
<h3 id="canonical-super-kmers">Canonical super-kmers</h3>
<p>A <strong>canonical super-kmer</strong> is the lexicographic minimum of a super-kmer and its reverse complement:</p>
<div class="highlight"><pre><span></span><code>canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
</code></pre></div>
<p>When a read and its reverse-complement are both sequenced, they produce super-kmers that are reverse complements of each other. Both map to the same canonical form: the same genomic region is represented by a single canonical super-kmer regardless of which strand was read.</p>
<h3 id="expected-length-of-a-super-kmer">Expected length of a super-kmer</h3>
<p>For a random minimizer of length m over k-mers of length k, the density of minimizer positions is approximately 2/(km+2) (Golan &amp; Shur 2025; Zheng <em>et al.</em> 2020)<sup id="fnref:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">2</a></sup> <sup id="fnref:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">3</a></sup>, so the expected number of consecutive k-mers per super-kmer is (km+2)/2. A run of n k-mers spans n + k 1 nucleotides, giving:</p>
<div class="arithmatex">\[L_{\text{nt}} = \frac{k-m+2}{2} + k - 1\]</div>
<p>For k=31, m=13: expected ≈ 40 nt. In practice super-kmers rarely exceed a few dozen nucleotides.<sup id="fnref:superkmer_length"><a class="footnote-ref" href="#fn:superkmer_length">1</a></sup></p>
<div class="footnote">
<hr />
<ol>
<li id="fn:superkmer_length">
<p>The expected length formula and the density approximation 2/(km+2) should be verified against the values reported in (Zheng <em>et al.</em> 2020)<sup id="fnref2:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">2</a></sup> and (Golan &amp; Shur 2025)<sup id="fnref2:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">3</a></sup>.&#160;<a class="footnote-backref" href="#fnref:superkmer_length" title="Jump back to footnote 1 in the text">&#8617;</a></p>
</li>
<li id="fn:Zheng2020-ji">
<p>Zheng, H., Kingsford, C. &amp; Marçais, G. (2020). <a href="https://doi.org/10.1093/bioinformatics/btaa472">Improved design and analysis of practical minimizers</a>. <em>Bioinformatics (Oxford, England)</em>, 36, i119--i127.&#160;<a class="footnote-backref" href="#fnref:Zheng2020-ji" title="Jump back to footnote 2 in the text">&#8617;</a><a class="footnote-backref" href="#fnref2:Zheng2020-ji" title="Jump back to footnote 2 in the text">&#8617;</a></p>
</li>
<li id="fn:Golan2025-xf">
<p>Golan, S. &amp; Shur, A.M. (2025). <a href="https://doi.org/10.1007/978-3-031-82670-2\_25">Expected density of random minimizers</a>. In: <em>Lecture notes in computer science</em>, Lecture notes in computer science. Springer Nature Switzerland, Cham, pp. 347--360.&#160;<a class="footnote-backref" href="#fnref:Golan2025-xf" title="Jump back to footnote 3 in the text">&#8617;</a><a class="footnote-backref" href="#fnref2:Golan2025-xf" title="Jump back to footnote 3 in the text">&#8617;</a></p>
</li>
</ol>
</div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>