first implementation but far to be optimal

This commit is contained in:
Eric Coissac
2026-04-16 22:38:20 +02:00
commit de3f9b16cf
19336 changed files with 380276 additions and 0 deletions
+716
View File
@@ -0,0 +1,716 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="icon" href="/assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>obikmer</title>
<link rel="stylesheet" href="/assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("/",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="/." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="/." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="/." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="/theory/kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="/implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="/implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="/architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1>404 - Not found</h1>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "/", "features": [], "search": "/assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="/assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+2133
View File
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,765 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../../../implementation/mphf/">
<link rel="icon" href="../../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Sequences - obikmer</title>
<link rel="stylesheet" href="../../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#manipulated-sequences" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Sequences
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../../theory/kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../../implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../../implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" checked>
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="manipulated-sequences">Manipulated sequences</h1>
<ul>
<li>We consider sequences only as compact form of a set of overlaping kmers</li>
<li>The largest kmers we considere are 31-mer</li>
<li>
<p>We only consider odd k</p>
</li>
<li>
<p>all sequences match /^[acgtACGT]+/</p>
</li>
<li>maximum length 256 nucleotides</li>
<li>minimum length k</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../../..", "features": [], "search": "../../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+18
View File
@@ -0,0 +1,18 @@
/*!
* Lunr languages, `Danish` language
* https://github.com/MihaiValentin/lunr-languages
*
* Copyright 2014, Mihai Valentin
* http://www.mozilla.org/MPL/
*/
/*!
* based on
* Snowball JavaScript Library v0.3
* http://code.google.com/p/urim/
* http://snowball.tartarus.org/
*
* Copyright 2010, Oleg Mazko
* http://www.mozilla.org/MPL/
*/
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.da=function(){this.pipeline.reset(),this.pipeline.add(e.da.trimmer,e.da.stopWordFilter,e.da.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.da.stemmer))},e.da.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.da.trimmer=e.trimmerSupport.generateTrimmer(e.da.wordCharacters),e.Pipeline.registerFunction(e.da.trimmer,"trimmer-da"),e.da.stemmer=function(){var r=e.stemmerSupport.Among,i=e.stemmerSupport.SnowballProgram,n=new function(){function e(){var e,r=f.cursor+3;if(d=f.limit,0<=r&&r<=f.limit){for(a=r;;){if(e=f.cursor,f.in_grouping(w,97,248)){f.cursor=e;break}if(f.cursor=e,e>=f.limit)return;f.cursor++}for(;!f.out_grouping(w,97,248);){if(f.cursor>=f.limit)return;f.cursor++}d=f.cursor,d<a&&(d=a)}}function n(){var e,r;if(f.cursor>=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(c,32),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 1:f.slice_del();break;case 2:f.in_grouping_b(p,97,229)&&f.slice_del()}}function t(){var e,r=f.limit-f.cursor;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.find_among_b(l,4)?(f.bra=f.cursor,f.limit_backward=e,f.cursor=f.limit-r,f.cursor>f.limit_backward&&(f.cursor--,f.bra=f.cursor,f.slice_del())):f.limit_backward=e)}function s(){var e,r,i,n=f.limit-f.cursor;if(f.ket=f.cursor,f.eq_s_b(2,"st")&&(f.bra=f.cursor,f.eq_s_b(2,"ig")&&f.slice_del()),f.cursor=f.limit-n,f.cursor>=d&&(r=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,e=f.find_among_b(m,5),f.limit_backward=r,e))switch(f.bra=f.cursor,e){case 1:f.slice_del(),i=f.limit-f.cursor,t(),f.cursor=f.limit-i;break;case 2:f.slice_from("løs")}}function o(){var e;f.cursor>=d&&(e=f.limit_backward,f.limit_backward=d,f.ket=f.cursor,f.out_grouping_b(w,97,248)?(f.bra=f.cursor,u=f.slice_to(u),f.limit_backward=e,f.eq_v_b(u)&&f.slice_del()):f.limit_backward=e)}var a,d,u,c=[new r("hed",-1,1),new r("ethed",0,1),new r("ered",-1,1),new r("e",-1,1),new r("erede",3,1),new r("ende",3,1),new r("erende",5,1),new r("ene",3,1),new r("erne",3,1),new r("ere",3,1),new r("en",-1,1),new r("heden",10,1),new r("eren",10,1),new r("er",-1,1),new r("heder",13,1),new r("erer",13,1),new r("s",-1,2),new r("heds",16,1),new r("es",16,1),new r("endes",18,1),new r("erendes",19,1),new r("enes",18,1),new r("ernes",18,1),new r("eres",18,1),new r("ens",16,1),new r("hedens",24,1),new r("erens",24,1),new r("ers",16,1),new r("ets",16,1),new r("erets",28,1),new r("et",-1,1),new r("eret",30,1)],l=[new r("gd",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("elig",1,1),new r("els",-1,1),new r("løst",-1,2)],w=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],p=[239,254,42,3,0,0,0,0,0,0,0,0,0,0,0,0,16],f=new i;this.setCurrent=function(e){f.setCurrent(e)},this.getCurrent=function(){return f.getCurrent()},this.stem=function(){var r=f.cursor;return e(),f.limit_backward=r,f.cursor=f.limit,n(),f.cursor=f.limit,t(),f.cursor=f.limit,s(),f.cursor=f.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return n.setCurrent(e),n.stem(),n.getCurrent()}):(n.setCurrent(e),n.stem(),n.getCurrent())}}(),e.Pipeline.registerFunction(e.da.stemmer,"stemmer-da"),e.da.stopWordFilter=e.generateStopWordFilter("ad af alle alt anden at blev blive bliver da de dem den denne der deres det dette dig din disse dog du efter eller en end er et for fra ham han hans har havde have hende hendes her hos hun hvad hvis hvor i ikke ind jeg jer jo kunne man mange med meget men mig min mine mit mod ned noget nogle nu når og også om op os over på selv sig sin sine sit skal skulle som sådan thi til ud under var vi vil ville vor være været".split(" ")),e.Pipeline.registerFunction(e.da.stopWordFilter,"stopWordFilter-da")}});
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.hi=function(){this.pipeline.reset(),this.pipeline.add(e.hi.trimmer,e.hi.stopWordFilter,e.hi.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.hi.stemmer))},e.hi.wordCharacters="ऀ-ःऄ-एऐ-टठ-यर-िी-ॏॐ-य़ॠ-९॰-ॿa-zA-Z-zA-0-9-",e.hi.trimmer=e.trimmerSupport.generateTrimmer(e.hi.wordCharacters),e.Pipeline.registerFunction(e.hi.trimmer,"trimmer-hi"),e.hi.stopWordFilter=e.generateStopWordFilter("अत अपना अपनी अपने अभी अंदर आदि आप इत्यादि इन इनका इन्हीं इन्हें इन्हों इस इसका इसकी इसके इसमें इसी इसे उन उनका उनकी उनके उनको उन्हीं उन्हें उन्हों उस उसके उसी उसे एक एवं एस ऐसे और कई कर करता करते करना करने करें कहते कहा का काफ़ी कि कितना किन्हें किन्हों किया किर किस किसी किसे की कुछ कुल के को कोई कौन कौनसा गया घर जब जहाँ जा जितना जिन जिन्हें जिन्हों जिस जिसे जीधर जैसा जैसे जो तक तब तरह तिन तिन्हें तिन्हों तिस तिसे तो था थी थे दबारा दिया दुसरा दूसरे दो द्वारा न नके नहीं ना निहायत नीचे ने पर पहले पूरा पे फिर बनी बही बहुत बाद बाला बिलकुल भी भीतर मगर मानो मे में यदि यह यहाँ यही या यिह ये रखें रहा रहे ऱ्वासा लिए लिये लेकिन व वग़ैरह वर्ग वह वहाँ वहीं वाले वुह वे वो सकता सकते सबसे सभी साथ साबुत साभ सारा से सो संग ही हुआ हुई हुए है हैं हो होता होती होते होना होने".split(" ")),e.hi.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var r=e.wordcut;r.init(),e.hi.tokenizer=function(i){if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(r){return isLunr2?new e.Token(r.toLowerCase()):r.toLowerCase()});var t=i.toString().toLowerCase().replace(/^\s+/,"");return r.cut(t).split("|")},e.Pipeline.registerFunction(e.hi.stemmer,"stemmer-hi"),e.Pipeline.registerFunction(e.hi.stopWordFilter,"stopWordFilter-hi")}});
File diff suppressed because one or more lines are too long
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.hy=function(){this.pipeline.reset(),this.pipeline.add(e.hy.trimmer,e.hy.stopWordFilter)},e.hy.wordCharacters="[A-Za-z԰-֏ff-ﭏ]",e.hy.trimmer=e.trimmerSupport.generateTrimmer(e.hy.wordCharacters),e.Pipeline.registerFunction(e.hy.trimmer,"trimmer-hy"),e.hy.stopWordFilter=e.generateStopWordFilter("դու և եք էիր էիք հետո նաև նրանք որը վրա է որ պիտի են այս մեջ ն իր ու ի այդ որոնք այն կամ էր մի ես համար այլ իսկ էին ենք հետ ին թ էինք մենք նրա նա դուք եմ էի ըստ որպես ում".split(" ")),e.Pipeline.registerFunction(e.hy.stopWordFilter,"stopWordFilter-hy"),e.hy.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}(),e.Pipeline.registerFunction(e.hy.stemmer,"stemmer-hy")}});
File diff suppressed because one or more lines are too long
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.ja=function(){this.pipeline.reset(),this.pipeline.add(e.ja.trimmer,e.ja.stopWordFilter,e.ja.stemmer),r?this.tokenizer=e.ja.tokenizer:(e.tokenizer&&(e.tokenizer=e.ja.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.ja.tokenizer))};var t=new e.TinySegmenter;e.ja.tokenizer=function(i){var n,o,s,p,a,u,m,l,c,f;if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t.toLowerCase()):t.toLowerCase()});for(o=i.toString().toLowerCase().replace(/^\s+/,""),n=o.length-1;n>=0;n--)if(/\S/.test(o.charAt(n))){o=o.substring(0,n+1);break}for(a=[],s=o.length,c=0,l=0;c<=s;c++)if(u=o.charAt(c),m=c-l,u.match(/\s/)||c==s){if(m>0)for(p=t.segment(o.slice(l,c)).filter(function(e){return!!e}),f=l,n=0;n<p.length;n++)r?a.push(new e.Token(p[n],{position:[f,p[n].length],index:a.length})):a.push(p[n]),f+=p[n].length;l=c+1}return a},e.ja.stemmer=function(){return function(e){return e}}(),e.Pipeline.registerFunction(e.ja.stemmer,"stemmer-ja"),e.ja.wordCharacters="一二三四五六七八九十百千万億兆一-龠々〆ヵヶぁ-んァ-ヴーア-ン゙a-zA-Z-zA-0-9-",e.ja.trimmer=e.trimmerSupport.generateTrimmer(e.ja.wordCharacters),e.Pipeline.registerFunction(e.ja.trimmer,"trimmer-ja"),e.ja.stopWordFilter=e.generateStopWordFilter("これ それ あれ この その あの ここ そこ あそこ こちら どこ だれ なに なん 何 私 貴方 貴方方 我々 私達 あの人 あのかた 彼女 彼 です あります おります います は が の に を で え から まで より も どの と し それで しかし".split(" ")),e.Pipeline.registerFunction(e.ja.stopWordFilter,"stopWordFilter-ja"),e.jp=e.ja,e.Pipeline.registerFunction(e.jp.stemmer,"stemmer-jp"),e.Pipeline.registerFunction(e.jp.trimmer,"trimmer-jp"),e.Pipeline.registerFunction(e.jp.stopWordFilter,"stopWordFilter-jp")}});
+1
View File
@@ -0,0 +1 @@
module.exports=require("./lunr.ja");
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.kn=function(){this.pipeline.reset(),this.pipeline.add(e.kn.trimmer,e.kn.stopWordFilter,e.kn.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.kn.stemmer))},e.kn.wordCharacters="ಀ-಄ಅ-ಔಕ-ಹಾ-ೌ಼-ಽೕ-ೖೝ-ೞೠ-ೡೢ-ೣ೤೥೦-೯ೱ-ೳ",e.kn.trimmer=e.trimmerSupport.generateTrimmer(e.kn.wordCharacters),e.Pipeline.registerFunction(e.kn.trimmer,"trimmer-kn"),e.kn.stopWordFilter=e.generateStopWordFilter("ಮತ್ತು ಈ ಒಂದು ರಲ್ಲಿ ಹಾಗೂ ಎಂದು ಅಥವಾ ಇದು ರ ಅವರು ಎಂಬ ಮೇಲೆ ಅವರ ತನ್ನ ಆದರೆ ತಮ್ಮ ನಂತರ ಮೂಲಕ ಹೆಚ್ಚು ನ ಆ ಕೆಲವು ಅನೇಕ ಎರಡು ಹಾಗು ಪ್ರಮುಖ ಇದನ್ನು ಇದರ ಸುಮಾರು ಅದರ ಅದು ಮೊದಲ ಬಗ್ಗೆ ನಲ್ಲಿ ರಂದು ಇತರ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ಸಹ ಸಾಮಾನ್ಯವಾಗಿ ನೇ ಹಲವಾರು ಹೊಸ ದಿ ಕಡಿಮೆ ಯಾವುದೇ ಹೊಂದಿದೆ ದೊಡ್ಡ ಅನ್ನು ಇವರು ಪ್ರಕಾರ ಇದೆ ಮಾತ್ರ ಕೂಡ ಇಲ್ಲಿ ಎಲ್ಲಾ ವಿವಿಧ ಅದನ್ನು ಹಲವು ರಿಂದ ಕೇವಲ ದ ದಕ್ಷಿಣ ಗೆ ಅವನ ಅತಿ ನೆಯ ಬಹಳ ಕೆಲಸ ಎಲ್ಲ ಪ್ರತಿ ಇತ್ಯಾದಿ ಇವು ಬೇರೆ ಹೀಗೆ ನಡುವೆ ಇದಕ್ಕೆ ಎಸ್ ಇವರ ಮೊದಲು ಶ್ರೀ ಮಾಡುವ ಇದರಲ್ಲಿ ರೀತಿಯ ಮಾಡಿದ ಕಾಲ ಅಲ್ಲಿ ಮಾಡಲು ಅದೇ ಈಗ ಅವು ಗಳು ಎ ಎಂಬುದು ಅವನು ಅಂದರೆ ಅವರಿಗೆ ಇರುವ ವಿಶೇಷ ಮುಂದೆ ಅವುಗಳ ಮುಂತಾದ ಮೂಲ ಬಿ ಮೀ ಒಂದೇ ಇನ್ನೂ ಹೆಚ್ಚಾಗಿ ಮಾಡಿ ಅವರನ್ನು ಇದೇ ಯ ರೀತಿಯಲ್ಲಿ ಜೊತೆ ಅದರಲ್ಲಿ ಮಾಡಿದರು ನಡೆದ ಆಗ ಮತ್ತೆ ಪೂರ್ವ ಆತ ಬಂದ ಯಾವ ಒಟ್ಟು ಇತರೆ ಹಿಂದೆ ಪ್ರಮಾಣದ ಗಳನ್ನು ಕುರಿತು ಯು ಆದ್ದರಿಂದ ಅಲ್ಲದೆ ನಗರದ ಮೇಲಿನ ಏಕೆಂದರೆ ರಷ್ಟು ಎಂಬುದನ್ನು ಬಾರಿ ಎಂದರೆ ಹಿಂದಿನ ಆದರೂ ಆದ ಸಂಬಂಧಿಸಿದ ಮತ್ತೊಂದು ಸಿ ಆತನ ".split(" ")),e.kn.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var r=e.wordcut;r.init(),e.kn.tokenizer=function(t){if(!arguments.length||null==t||void 0==t)return[];if(Array.isArray(t))return t.map(function(r){return isLunr2?new e.Token(r.toLowerCase()):r.toLowerCase()});var n=t.toString().toLowerCase().replace(/^\s+/,"");return r.cut(n).split("|")},e.Pipeline.registerFunction(e.kn.stemmer,"stemmer-kn"),e.Pipeline.registerFunction(e.kn.stopWordFilter,"stopWordFilter-kn")}});
File diff suppressed because one or more lines are too long
+1
View File
@@ -0,0 +1 @@
!function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){e.multiLanguage=function(){for(var t=Array.prototype.slice.call(arguments),i=t.join("-"),r="",n=[],s=[],p=0;p<t.length;++p)"en"==t[p]?(r+="\\w",n.unshift(e.stopWordFilter),n.push(e.stemmer),s.push(e.stemmer)):(r+=e[t[p]].wordCharacters,e[t[p]].stopWordFilter&&n.unshift(e[t[p]].stopWordFilter),e[t[p]].stemmer&&(n.push(e[t[p]].stemmer),s.push(e[t[p]].stemmer)));var o=e.trimmerSupport.generateTrimmer(r);return e.Pipeline.registerFunction(o,"lunr-multi-trimmer-"+i),n.unshift(o),function(){this.pipeline.reset(),this.pipeline.add.apply(this.pipeline,n),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add.apply(this.searchPipeline,s))}}}});
File diff suppressed because one or more lines are too long
+18
View File
@@ -0,0 +1,18 @@
/*!
* Lunr languages, `Norwegian` language
* https://github.com/MihaiValentin/lunr-languages
*
* Copyright 2014, Mihai Valentin
* http://www.mozilla.org/MPL/
*/
/*!
* based on
* Snowball JavaScript Library v0.3
* http://code.google.com/p/urim/
* http://snowball.tartarus.org/
*
* Copyright 2010, Oleg Mazko
* http://www.mozilla.org/MPL/
*/
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.no=function(){this.pipeline.reset(),this.pipeline.add(e.no.trimmer,e.no.stopWordFilter,e.no.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.no.stemmer))},e.no.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.no.trimmer=e.trimmerSupport.generateTrimmer(e.no.wordCharacters),e.Pipeline.registerFunction(e.no.trimmer,"trimmer-no"),e.no.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,i=new function(){function e(){var e,r=w.cursor+3;if(a=w.limit,0<=r||r<=w.limit){for(s=r;;){if(e=w.cursor,w.in_grouping(d,97,248)){w.cursor=e;break}if(e>=w.limit)return;w.cursor=e+1}for(;!w.out_grouping(d,97,248);){if(w.cursor>=w.limit)return;w.cursor++}a=w.cursor,a<s&&(a=s)}}function i(){var e,r,n;if(w.cursor>=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(m,29),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:n=w.limit-w.cursor,w.in_grouping_b(c,98,122)?w.slice_del():(w.cursor=w.limit-n,w.eq_s_b(1,"k")&&w.out_grouping_b(d,97,248)&&w.slice_del());break;case 3:w.slice_from("er")}}function t(){var e,r=w.limit-w.cursor;w.cursor>=a&&(e=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,w.find_among_b(u,2)?(w.bra=w.cursor,w.limit_backward=e,w.cursor=w.limit-r,w.cursor>w.limit_backward&&(w.cursor--,w.bra=w.cursor,w.slice_del())):w.limit_backward=e)}function o(){var e,r;w.cursor>=a&&(r=w.limit_backward,w.limit_backward=a,w.ket=w.cursor,e=w.find_among_b(l,11),e?(w.bra=w.cursor,w.limit_backward=r,1==e&&w.slice_del()):w.limit_backward=r)}var s,a,m=[new r("a",-1,1),new r("e",-1,1),new r("ede",1,1),new r("ande",1,1),new r("ende",1,1),new r("ane",1,1),new r("ene",1,1),new r("hetene",6,1),new r("erte",1,3),new r("en",-1,1),new r("heten",9,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",12,1),new r("s",-1,2),new r("as",14,1),new r("es",14,1),new r("edes",16,1),new r("endes",16,1),new r("enes",16,1),new r("hetenes",19,1),new r("ens",14,1),new r("hetens",21,1),new r("ers",14,1),new r("ets",14,1),new r("et",-1,1),new r("het",25,1),new r("ert",-1,3),new r("ast",-1,1)],u=[new r("dt",-1,-1),new r("vt",-1,-1)],l=[new r("leg",-1,1),new r("eleg",0,1),new r("ig",-1,1),new r("eig",2,1),new r("lig",2,1),new r("elig",4,1),new r("els",-1,1),new r("lov",-1,1),new r("elov",7,1),new r("slov",7,1),new r("hetslov",9,1)],d=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,48,0,128],c=[119,125,149,1],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,i(),w.cursor=w.limit,t(),w.cursor=w.limit,o(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return i.setCurrent(e),i.stem(),i.getCurrent()}):(i.setCurrent(e),i.stem(),i.getCurrent())}}(),e.Pipeline.registerFunction(e.no.stemmer,"stemmer-no"),e.no.stopWordFilter=e.generateStopWordFilter("alle at av bare begge ble blei bli blir blitt både båe da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då eg ein eit eitt eller elles en enn er et ett etter for fordi fra før ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor i ikke ikkje ikkje ingen ingi inkje inn inni ja jeg kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor man mange me med medan meg meget mellom men mi min mine mitt mot mykje ned no noe noen noka noko nokon nokor nokre nå når og også om opp oss over på samme seg selv si si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn til um upp ut uten var vart varte ved vere verte vi vil ville vore vors vort vår være være vært å".split(" ")),e.Pipeline.registerFunction(e.no.stopWordFilter,"stopWordFilter-no")}});
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.sa=function(){this.pipeline.reset(),this.pipeline.add(e.sa.trimmer,e.sa.stopWordFilter,e.sa.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.sa.stemmer))},e.sa.wordCharacters="ऀ-ःऄ-एऐ-टठ-यर-िी-ॏॐ-य़ॠ-९॰-ॿ꣠-꣱ꣲ-ꣷ꣸-ꣻ꣼-ꣽꣾ-ꣿᆰ0-ᆰ9",e.sa.trimmer=e.trimmerSupport.generateTrimmer(e.sa.wordCharacters),e.Pipeline.registerFunction(e.sa.trimmer,"trimmer-sa"),e.sa.stopWordFilter=e.generateStopWordFilter('तथा अयम्‌ एकम्‌ इत्यस्मिन्‌ तथा तत्‌ वा अयम्‌ इत्यस्य ते आहूत उपरि तेषाम्‌ किन्तु तेषाम्‌ तदा इत्यनेन अधिकः इत्यस्य तत्‌ केचन बहवः द्वि तथा महत्वपूर्णः अयम्‌ अस्य विषये अयं अस्ति तत्‌ प्रथमः विषये इत्युपरि इत्युपरि इतर अधिकतमः अधिकः अपि सामान्यतया ठ इतरेतर नूतनम्‌ द न्यूनम्‌ कश्चित्‌ वा विशालः द सः अस्ति तदनुसारम् तत्र अस्ति केवलम्‌ अपि अत्र सर्वे विविधाः तत्‌ बहवः यतः इदानीम्‌ द दक्षिण इत्यस्मै तस्य उपरि नथ अतीव कार्यम्‌ सर्वे एकैकम्‌ इत्यादि। एते सन्ति उत इत्थम्‌ मध्ये एतदर्थं . स कस्य प्रथमः श्री. करोति अस्मिन् प्रकारः निर्मिता कालः तत्र कर्तुं समान अधुना ते सन्ति स एकः अस्ति सः अर्थात् तेषां कृते . स्थितम् विशेषः अग्रिम तेषाम्‌ समान स्रोतः ख म समान इदानीमपि अधिकतया करोतु ते समान इत्यस्य वीथी सह यस्मिन् कृतवान्‌ धृतः तदा पुनः पूर्वं सः आगतः किम्‌ कुल इतर पुरा मात्रा स विषये उ अतएव अपि नगरस्य उपरि यतः प्रतिशतं कतरः कालः साधनानि भूत तथापि जात सम्बन्धि अन्यत्‌ ग अतः अस्माकं स्वकीयाः अस्माकं इदानीं अन्तः इत्यादयः भवन्तः इत्यादयः एते एताः तस्य अस्य इदम् एते तेषां तेषां तेषां तान् तेषां तेषां तेषां समानः सः एकः च तादृशाः बहवः अन्ये च वदन्ति यत् कियत् कस्मै कस्मै यस्मै यस्मै यस्मै यस्मै न अतिनीचः किन्तु प्रथमं सम्पूर्णतया ततः चिरकालानन्तरं पुस्तकं सम्पूर्णतया अन्तः किन्तु अत्र वा इह इव श्रद्धाय अवशिष्यते परन्तु अन्ये वर्गाः सन्ति ते सन्ति शक्नुवन्ति सर्वे मिलित्वा सर्वे एकत्र"'.split(" ")),e.sa.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var r=e.wordcut;r.init(),e.sa.tokenizer=function(t){if(!arguments.length||null==t||void 0==t)return[];if(Array.isArray(t))return t.map(function(r){return isLunr2?new e.Token(r.toLowerCase()):r.toLowerCase()});var i=t.toString().toLowerCase().replace(/^\s+/,"");return r.cut(i).split("|")},e.Pipeline.registerFunction(e.sa.stemmer,"stemmer-sa"),e.Pipeline.registerFunction(e.sa.stopWordFilter,"stopWordFilter-sa")}});
@@ -0,0 +1 @@
!function(r,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(r.lunr)}(this,function(){return function(r){r.stemmerSupport={Among:function(r,t,i,s){if(this.toCharArray=function(r){for(var t=r.length,i=new Array(t),s=0;s<t;s++)i[s]=r.charCodeAt(s);return i},!r&&""!=r||!t&&0!=t||!i)throw"Bad Among initialisation: s:"+r+", substring_i: "+t+", result: "+i;this.s_size=r.length,this.s=this.toCharArray(r),this.substring_i=t,this.result=i,this.method=s},SnowballProgram:function(){var r;return{bra:0,ket:0,limit:0,cursor:0,limit_backward:0,setCurrent:function(t){r=t,this.cursor=0,this.limit=t.length,this.limit_backward=0,this.bra=this.cursor,this.ket=this.limit},getCurrent:function(){var t=r;return r=null,t},in_grouping:function(t,i,s){if(this.cursor<this.limit){var e=r.charCodeAt(this.cursor);if(e<=s&&e>=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor++,!0}return!1},in_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e<=s&&e>=i&&(e-=i,t[e>>3]&1<<(7&e)))return this.cursor--,!0}return!1},out_grouping:function(t,i,s){if(this.cursor<this.limit){var e=r.charCodeAt(this.cursor);if(e>s||e<i)return this.cursor++,!0;if(e-=i,!(t[e>>3]&1<<(7&e)))return this.cursor++,!0}return!1},out_grouping_b:function(t,i,s){if(this.cursor>this.limit_backward){var e=r.charCodeAt(this.cursor-1);if(e>s||e<i)return this.cursor--,!0;if(e-=i,!(t[e>>3]&1<<(7&e)))return this.cursor--,!0}return!1},eq_s:function(t,i){if(this.limit-this.cursor<t)return!1;for(var s=0;s<t;s++)if(r.charCodeAt(this.cursor+s)!=i.charCodeAt(s))return!1;return this.cursor+=t,!0},eq_s_b:function(t,i){if(this.cursor-this.limit_backward<t)return!1;for(var s=0;s<t;s++)if(r.charCodeAt(this.cursor-t+s)!=i.charCodeAt(s))return!1;return this.cursor-=t,!0},find_among:function(t,i){for(var s=0,e=i,n=this.cursor,u=this.limit,o=0,h=0,c=!1;;){for(var a=s+(e-s>>1),f=0,l=o<h?o:h,_=t[a],m=l;m<_.s_size;m++){if(n+l==u){f=-1;break}if(f=r.charCodeAt(n+l)-_.s[m])break;l++}if(f<0?(e=a,h=l):(s=a,o=l),e-s<=1){if(s>0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n+_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n+_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},find_among_b:function(t,i){for(var s=0,e=i,n=this.cursor,u=this.limit_backward,o=0,h=0,c=!1;;){for(var a=s+(e-s>>1),f=0,l=o<h?o:h,_=t[a],m=_.s_size-1-l;m>=0;m--){if(n-l==u){f=-1;break}if(f=r.charCodeAt(n-1-l)-_.s[m])break;l++}if(f<0?(e=a,h=l):(s=a,o=l),e-s<=1){if(s>0||e==s||c)break;c=!0}}for(;;){var _=t[s];if(o>=_.s_size){if(this.cursor=n-_.s_size,!_.method)return _.result;var b=_.method();if(this.cursor=n-_.s_size,b)return _.result}if((s=_.substring_i)<0)return 0}},replace_s:function(t,i,s){var e=s.length-(i-t),n=r.substring(0,t),u=r.substring(i);return r=n+s+u,this.limit+=e,this.cursor>=i?this.cursor+=e:this.cursor>t&&(this.cursor=t),e},slice_check:function(){if(this.bra<0||this.bra>this.ket||this.ket>this.limit||this.limit>r.length)throw"faulty slice operation"},slice_from:function(r){this.slice_check(),this.replace_s(this.bra,this.ket,r)},slice_del:function(){this.slice_from("")},insert:function(r,t,i){var s=this.replace_s(r,t,i);r<=this.bra&&(this.bra+=s),r<=this.ket&&(this.ket+=s)},slice_to:function(){return this.slice_check(),r.substring(this.bra,this.ket)},eq_v_b:function(r){return this.eq_s_b(r.length,r)}}}},r.trimmerSupport={generateTrimmer:function(r){var t=new RegExp("^[^"+r+"]+"),i=new RegExp("[^"+r+"]+$");return function(r){return"function"==typeof r.update?r.update(function(r){return r.replace(t,"").replace(i,"")}):r.replace(t,"").replace(i,"")}}}}});
+18
View File
@@ -0,0 +1,18 @@
/*!
* Lunr languages, `Swedish` language
* https://github.com/MihaiValentin/lunr-languages
*
* Copyright 2014, Mihai Valentin
* http://www.mozilla.org/MPL/
*/
/*!
* based on
* Snowball JavaScript Library v0.3
* http://code.google.com/p/urim/
* http://snowball.tartarus.org/
*
* Copyright 2010, Oleg Mazko
* http://www.mozilla.org/MPL/
*/
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.sv=function(){this.pipeline.reset(),this.pipeline.add(e.sv.trimmer,e.sv.stopWordFilter,e.sv.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.sv.stemmer))},e.sv.wordCharacters="A-Za-zªºÀ-ÖØ-öø-ʸˠ-ˤᴀ-ᴥᴬ-ᵜᵢ-ᵥᵫ-ᵷᵹ-ᶾḀ-ỿⁱⁿₐ-ₜKÅℲⅎⅠ-ↈⱠ-ⱿꜢ-ꞇꞋ-ꞭꞰ-ꞷꟷ-ꟿꬰ-ꭚꭜ-ꭤff-stA-Za-z",e.sv.trimmer=e.trimmerSupport.generateTrimmer(e.sv.wordCharacters),e.Pipeline.registerFunction(e.sv.trimmer,"trimmer-sv"),e.sv.stemmer=function(){var r=e.stemmerSupport.Among,n=e.stemmerSupport.SnowballProgram,t=new function(){function e(){var e,r=w.cursor+3;if(o=w.limit,0<=r||r<=w.limit){for(a=r;;){if(e=w.cursor,w.in_grouping(l,97,246)){w.cursor=e;break}if(w.cursor=e,w.cursor>=w.limit)return;w.cursor++}for(;!w.out_grouping(l,97,246);){if(w.cursor>=w.limit)return;w.cursor++}o=w.cursor,o<a&&(o=a)}}function t(){var e,r=w.limit_backward;if(w.cursor>=o&&(w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(u,37),w.limit_backward=r,e))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.in_grouping_b(d,98,121)&&w.slice_del()}}function i(){var e=w.limit_backward;w.cursor>=o&&(w.limit_backward=o,w.cursor=w.limit,w.find_among_b(c,7)&&(w.cursor=w.limit,w.ket=w.cursor,w.cursor>w.limit_backward&&(w.bra=--w.cursor,w.slice_del())),w.limit_backward=e)}function s(){var e,r;if(w.cursor>=o){if(r=w.limit_backward,w.limit_backward=o,w.cursor=w.limit,w.ket=w.cursor,e=w.find_among_b(m,5))switch(w.bra=w.cursor,e){case 1:w.slice_del();break;case 2:w.slice_from("lös");break;case 3:w.slice_from("full")}w.limit_backward=r}}var a,o,u=[new r("a",-1,1),new r("arna",0,1),new r("erna",0,1),new r("heterna",2,1),new r("orna",0,1),new r("ad",-1,1),new r("e",-1,1),new r("ade",6,1),new r("ande",6,1),new r("arne",6,1),new r("are",6,1),new r("aste",6,1),new r("en",-1,1),new r("anden",12,1),new r("aren",12,1),new r("heten",12,1),new r("ern",-1,1),new r("ar",-1,1),new r("er",-1,1),new r("heter",18,1),new r("or",-1,1),new r("s",-1,2),new r("as",21,1),new r("arnas",22,1),new r("ernas",22,1),new r("ornas",22,1),new r("es",21,1),new r("ades",26,1),new r("andes",26,1),new r("ens",21,1),new r("arens",29,1),new r("hetens",29,1),new r("erns",21,1),new r("at",-1,1),new r("andet",-1,1),new r("het",-1,1),new r("ast",-1,1)],c=[new r("dd",-1,-1),new r("gd",-1,-1),new r("nn",-1,-1),new r("dt",-1,-1),new r("gt",-1,-1),new r("kt",-1,-1),new r("tt",-1,-1)],m=[new r("ig",-1,1),new r("lig",0,1),new r("els",-1,1),new r("fullt",-1,3),new r("löst",-1,2)],l=[17,65,16,1,0,0,0,0,0,0,0,0,0,0,0,0,24,0,32],d=[119,127,149],w=new n;this.setCurrent=function(e){w.setCurrent(e)},this.getCurrent=function(){return w.getCurrent()},this.stem=function(){var r=w.cursor;return e(),w.limit_backward=r,w.cursor=w.limit,t(),w.cursor=w.limit,i(),w.cursor=w.limit,s(),!0}};return function(e){return"function"==typeof e.update?e.update(function(e){return t.setCurrent(e),t.stem(),t.getCurrent()}):(t.setCurrent(e),t.stem(),t.getCurrent())}}(),e.Pipeline.registerFunction(e.sv.stemmer,"stemmer-sv"),e.sv.stopWordFilter=e.generateStopWordFilter("alla allt att av blev bli blir blivit de dem den denna deras dess dessa det detta dig din dina ditt du där då efter ej eller en er era ert ett från för ha hade han hans har henne hennes hon honom hur här i icke ingen inom inte jag ju kan kunde man med mellan men mig min mina mitt mot mycket ni nu när någon något några och om oss på samma sedan sig sin sina sitta själv skulle som så sådan sådana sådant till under upp ut utan vad var vara varför varit varje vars vart vem vi vid vilka vilkas vilken vilket vår våra vårt än är åt över".split(" ")),e.Pipeline.registerFunction(e.sv.stopWordFilter,"stopWordFilter-sv")}});
+1
View File
@@ -0,0 +1 @@
!function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.ta=function(){this.pipeline.reset(),this.pipeline.add(e.ta.trimmer,e.ta.stopWordFilter,e.ta.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.ta.stemmer))},e.ta.wordCharacters="஀-உஊ-ஏஐ-ஙச-ட஠-னப-யர-ஹ஺-ிீ-௉ொ-௏ௐ-௙௚-௟௠-௩௪-௯௰-௹௺-௿a-zA-Z-zA-0-9-",e.ta.trimmer=e.trimmerSupport.generateTrimmer(e.ta.wordCharacters),e.Pipeline.registerFunction(e.ta.trimmer,"trimmer-ta"),e.ta.stopWordFilter=e.generateStopWordFilter("அங்கு அங்கே அது அதை அந்த அவர் அவர்கள் அவள் அவன் அவை ஆக ஆகவே ஆகையால் ஆதலால் ஆதலினால் ஆனாலும் ஆனால் இங்கு இங்கே இது இதை இந்த இப்படி இவர் இவர்கள் இவள் இவன் இவை இவ்வளவு உனக்கு உனது உன் உன்னால் எங்கு எங்கே எது எதை எந்த எப்படி எவர் எவர்கள் எவள் எவன் எவை எவ்வளவு எனக்கு எனது எனவே என் என்ன என்னால் ஏது ஏன் தனது தன்னால் தானே தான் நாங்கள் நாம் நான் நீ நீங்கள்".split(" ")),e.ta.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var t=e.wordcut;t.init(),e.ta.tokenizer=function(r){if(!arguments.length||null==r||void 0==r)return[];if(Array.isArray(r))return r.map(function(t){return isLunr2?new e.Token(t.toLowerCase()):t.toLowerCase()});var i=r.toString().toLowerCase().replace(/^\s+/,"");return t.cut(i).split("|")},e.Pipeline.registerFunction(e.ta.stemmer,"stemmer-ta"),e.Pipeline.registerFunction(e.ta.stopWordFilter,"stopWordFilter-ta")}});
+1
View File
@@ -0,0 +1 @@
!function(e,t){"function"==typeof define&&define.amd?define(t):"object"==typeof exports?module.exports=t():t()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.te=function(){this.pipeline.reset(),this.pipeline.add(e.te.trimmer,e.te.stopWordFilter,e.te.stemmer),this.searchPipeline&&(this.searchPipeline.reset(),this.searchPipeline.add(e.te.stemmer))},e.te.wordCharacters="ఀ-ఄఅ-ఔక-హా-ౌౕ-ౖౘ-ౚౠ-ౡౢ-ౣ౦-౯౸-౿఼ఽ్ౝ౷౤౥",e.te.trimmer=e.trimmerSupport.generateTrimmer(e.te.wordCharacters),e.Pipeline.registerFunction(e.te.trimmer,"trimmer-te"),e.te.stopWordFilter=e.generateStopWordFilter("అందరూ అందుబాటులో అడగండి అడగడం అడ్డంగా అనుగుణంగా అనుమతించు అనుమతిస్తుంది అయితే ఇప్పటికే ఉన్నారు ఎక్కడైనా ఎప్పుడు ఎవరైనా ఎవరో ఏ ఏదైనా ఏమైనప్పటికి ఒక ఒకరు కనిపిస్తాయి కాదు కూడా గా గురించి చుట్టూ చేయగలిగింది తగిన తర్వాత దాదాపు దూరంగా నిజంగా పై ప్రకారం ప్రక్కన మధ్య మరియు మరొక మళ్ళీ మాత్రమే మెచ్చుకో వద్ద వెంట వేరుగా వ్యతిరేకంగా సంబంధం".split(" ")),e.te.stemmer=function(){return function(e){return"function"==typeof e.update?e.update(function(e){return e}):e}}();var t=e.wordcut;t.init(),e.te.tokenizer=function(r){if(!arguments.length||null==r||void 0==r)return[];if(Array.isArray(r))return r.map(function(t){return isLunr2?new e.Token(t.toLowerCase()):t.toLowerCase()});var i=r.toString().toLowerCase().replace(/^\s+/,"");return t.cut(i).split("|")},e.Pipeline.registerFunction(e.te.stemmer,"stemmer-te"),e.Pipeline.registerFunction(e.te.stopWordFilter,"stopWordFilter-te")}});
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var r="2"==e.version[0];e.th=function(){this.pipeline.reset(),this.pipeline.add(e.th.trimmer),r?this.tokenizer=e.th.tokenizer:(e.tokenizer&&(e.tokenizer=e.th.tokenizer),this.tokenizerFn&&(this.tokenizerFn=e.th.tokenizer))},e.th.wordCharacters="[฀-๿]",e.th.trimmer=e.trimmerSupport.generateTrimmer(e.th.wordCharacters),e.Pipeline.registerFunction(e.th.trimmer,"trimmer-th");var t=e.wordcut;t.init(),e.th.tokenizer=function(i){if(!arguments.length||null==i||void 0==i)return[];if(Array.isArray(i))return i.map(function(t){return r?new e.Token(t):t});var n=i.toString().replace(/^\s+/,"");return t.cut(n).split("|")}}});
File diff suppressed because one or more lines are too long
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r():r()(e.lunr)}(this,function(){return function(e){if(void 0===e)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===e.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");e.vi=function(){this.pipeline.reset(),this.pipeline.add(e.vi.stopWordFilter,e.vi.trimmer)},e.vi.wordCharacters="[A-Za-ẓ̀͐́͑̉̃̓ÂâÊêÔôĂ-ăĐ-đƠ-ơƯ-ư]",e.vi.trimmer=e.trimmerSupport.generateTrimmer(e.vi.wordCharacters),e.Pipeline.registerFunction(e.vi.trimmer,"trimmer-vi"),e.vi.stopWordFilter=e.generateStopWordFilter("là cái nhưng mà".split(" "))}});
+1
View File
@@ -0,0 +1 @@
!function(e,r){"function"==typeof define&&define.amd?define(r):"object"==typeof exports?module.exports=r(require("@node-rs/jieba")):r()(e.lunr)}(this,function(e){return function(r,t){if(void 0===r)throw new Error("Lunr is not present. Please include / require Lunr before this script.");if(void 0===r.stemmerSupport)throw new Error("Lunr stemmer support is not present. Please include / require Lunr stemmer support before this script.");var i="2"==r.version[0];r.zh=function(){this.pipeline.reset(),this.pipeline.add(r.zh.trimmer,r.zh.stopWordFilter,r.zh.stemmer),i?this.tokenizer=r.zh.tokenizer:(r.tokenizer&&(r.tokenizer=r.zh.tokenizer),this.tokenizerFn&&(this.tokenizerFn=r.zh.tokenizer))},r.zh.tokenizer=function(n){if(!arguments.length||null==n||void 0==n)return[];if(Array.isArray(n))return n.map(function(e){return i?new r.Token(e.toLowerCase()):e.toLowerCase()});t&&e.load(t);var o=n.toString().trim().toLowerCase(),s=[];e.cut(o,!0).forEach(function(e){s=s.concat(e.split(" "))}),s=s.filter(function(e){return!!e});var u=0;return s.map(function(e,t){if(i){var n=o.indexOf(e,u),s={};return s.position=[n,e.length],s.index=t,u=n,new r.Token(e,s)}return e})},r.zh.wordCharacters="\\w一-龥",r.zh.trimmer=r.trimmerSupport.generateTrimmer(r.zh.wordCharacters),r.Pipeline.registerFunction(r.zh.trimmer,"trimmer-zh"),r.zh.stemmer=function(){return function(e){return e}}(),r.Pipeline.registerFunction(r.zh.stemmer,"stemmer-zh"),r.zh.stopWordFilter=r.generateStopWordFilter("的 一 不 在 人 有 是 为 為 以 于 於 上 他 而 后 後 之 来 來 及 了 因 下 可 到 由 这 這 与 與 也 此 但 并 並 个 個 其 已 无 無 小 我 们 們 起 最 再 今 去 好 只 又 或 很 亦 某 把 那 你 乃 它 吧 被 比 别 趁 当 當 从 從 得 打 凡 儿 兒 尔 爾 该 該 各 给 給 跟 和 何 还 還 即 几 幾 既 看 据 據 距 靠 啦 另 么 麽 每 嘛 拿 哪 您 凭 憑 且 却 卻 让 讓 仍 啥 如 若 使 谁 誰 虽 雖 随 隨 同 所 她 哇 嗡 往 些 向 沿 哟 喲 用 咱 则 則 怎 曾 至 致 着 著 诸 諸 自".split(" ")),r.Pipeline.registerFunction(r.zh.stopWordFilter,"stopWordFilter-zh")}});
+206
View File
@@ -0,0 +1,206 @@
/**
* export the module via AMD, CommonJS or as a browser global
* Export code from https://github.com/umdjs/umd/blob/master/returnExports.js
*/
;(function (root, factory) {
if (typeof define === 'function' && define.amd) {
// AMD. Register as an anonymous module.
define(factory)
} else if (typeof exports === 'object') {
/**
* Node. Does not work with strict CommonJS, but
* only CommonJS-like environments that support module.exports,
* like Node.
*/
module.exports = factory()
} else {
// Browser globals (root is window)
factory()(root.lunr);
}
}(this, function () {
/**
* Just return a value to define the module export.
* This example returns an object, but the module
* can return a function as the exported value.
*/
return function(lunr) {
// TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
// (c) 2008 Taku Kudo <taku@chasen.org>
// TinySegmenter is freely distributable under the terms of a new BSD licence.
// For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
function TinySegmenter() {
var patterns = {
"[一二三四五六七八九十百千万億兆]":"M",
"[一-龠々〆ヵヶ]":"H",
"[ぁ-ん]":"I",
"[ァ-ヴーア-ン゙ー]":"K",
"[a-zA-Z-zA-]":"A",
"[0-9-]":"N"
}
this.chartype_ = [];
for (var i in patterns) {
var regexp = new RegExp(i);
this.chartype_.push([regexp, patterns[i]]);
}
this.BIAS__ = -332
this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378};
this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920};
this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266};
this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352};
this.BP2__ = {"BO":60,"OO":-1762};
this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965};
this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146};
this.BQ3__ = {"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699};
this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973};
this.BW1__ = {",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682};
this.BW2__ = {"..":-11822,"11":-669,"――":-5730,"−−":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"11":-669};
this.BW3__ = {"あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990};
this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832};
this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649};
this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393};
this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841};
this.TQ1__ = {"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68};
this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591};
this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685};
this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156};
this.TW1__ = {"につい":-4681,"東京都":2026};
this.TW2__ = {"ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216};
this.TW3__ = {"いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287};
this.TW4__ = {"いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865};
this.UC1__ = {"A":484,"K":93,"M":645,"O":-505};
this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646};
this.UC3__ = {"A":-1370,"I":2311};
this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646};
this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831};
this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387};
this.UP1__ = {"O":-214};
this.UP2__ = {"B":69,"O":935};
this.UP3__ = {"B":189};
this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422};
this.UQ2__ = {"BH":216,"BI":113,"OK":1759};
this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212};
this.UW1__ = {",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547,"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ":169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561,"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"「":-463,"・":-135};
this.UW2__ = {",":-829,"、":-829,"":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454,"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188,"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130,"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694,"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302,"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813,"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462,"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752,"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744,"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"「":-645,"」":3145,"ッ":831,"ア":-587,"カ":306,"キ":568};
this.UW3__ = {",":4889,"1":-800,"":-1723,"、":4889,"々":-2311,"":5827,"」":2670,"〓":-3573,"あ":-2696,"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401,"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521,"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056,"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323,"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518,"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858,"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475,"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646,"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837,"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008,"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613,"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960,"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375,"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432,"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"":-800,"」":2670,"・":-3794,"ッ":-1350,"ア":551,"グ":1319,"ス":874,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278};
this.UW4__ = {",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"":4999,"「":1895,"」":3798,"〓":-5156,"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821,"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731,"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994,"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940,"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882,"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896,"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006,"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776,"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809,"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553,"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845,"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543,"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485,"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940,"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000,"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792,"館":-1984,"首":1749,"高":2120,"「":1895,"」":3798,"・":-4371,"ッ":-724,"ー":-11870,"カ":2145,"コ":1789,"セ":1287,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637};
this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331,"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537,"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682,"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208,"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722,"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785,"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218,"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319,"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073,"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"":-514,"E2":-32768,"「":363,"イ":241,"ル":451,"ン":-343};
this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121,"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149,"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201,"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535,"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"":-270,"E1":306,"ル":-673,"ン":-496};
return this;
}
TinySegmenter.prototype.ctype_ = function(str) {
for (var i in this.chartype_) {
if (str.match(this.chartype_[i][0])) {
return this.chartype_[i][1];
}
}
return "O";
}
TinySegmenter.prototype.ts_ = function(v) {
if (v) { return v; }
return 0;
}
TinySegmenter.prototype.segment = function(input) {
if (input == null || input == undefined || input == "") {
return [];
}
var result = [];
var seg = ["B3","B2","B1"];
var ctype = ["O","O","O"];
var o = input.split("");
for (i = 0; i < o.length; ++i) {
seg.push(o[i]);
ctype.push(this.ctype_(o[i]))
}
seg.push("E1");
seg.push("E2");
seg.push("E3");
ctype.push("O");
ctype.push("O");
ctype.push("O");
var word = seg[3];
var p1 = "U";
var p2 = "U";
var p3 = "U";
for (var i = 4; i < seg.length - 3; ++i) {
var score = this.BIAS__;
var w1 = seg[i-3];
var w2 = seg[i-2];
var w3 = seg[i-1];
var w4 = seg[i];
var w5 = seg[i+1];
var w6 = seg[i+2];
var c1 = ctype[i-3];
var c2 = ctype[i-2];
var c3 = ctype[i-1];
var c4 = ctype[i];
var c5 = ctype[i+1];
var c6 = ctype[i+2];
score += this.ts_(this.UP1__[p1]);
score += this.ts_(this.UP2__[p2]);
score += this.ts_(this.UP3__[p3]);
score += this.ts_(this.BP1__[p1 + p2]);
score += this.ts_(this.BP2__[p2 + p3]);
score += this.ts_(this.UW1__[w1]);
score += this.ts_(this.UW2__[w2]);
score += this.ts_(this.UW3__[w3]);
score += this.ts_(this.UW4__[w4]);
score += this.ts_(this.UW5__[w5]);
score += this.ts_(this.UW6__[w6]);
score += this.ts_(this.BW1__[w2 + w3]);
score += this.ts_(this.BW2__[w3 + w4]);
score += this.ts_(this.BW3__[w4 + w5]);
score += this.ts_(this.TW1__[w1 + w2 + w3]);
score += this.ts_(this.TW2__[w2 + w3 + w4]);
score += this.ts_(this.TW3__[w3 + w4 + w5]);
score += this.ts_(this.TW4__[w4 + w5 + w6]);
score += this.ts_(this.UC1__[c1]);
score += this.ts_(this.UC2__[c2]);
score += this.ts_(this.UC3__[c3]);
score += this.ts_(this.UC4__[c4]);
score += this.ts_(this.UC5__[c5]);
score += this.ts_(this.UC6__[c6]);
score += this.ts_(this.BC1__[c2 + c3]);
score += this.ts_(this.BC2__[c3 + c4]);
score += this.ts_(this.BC3__[c4 + c5]);
score += this.ts_(this.TC1__[c1 + c2 + c3]);
score += this.ts_(this.TC2__[c2 + c3 + c4]);
score += this.ts_(this.TC3__[c3 + c4 + c5]);
score += this.ts_(this.TC4__[c4 + c5 + c6]);
// score += this.ts_(this.TC5__[c4 + c5 + c6]);
score += this.ts_(this.UQ1__[p1 + c1]);
score += this.ts_(this.UQ2__[p2 + c2]);
score += this.ts_(this.UQ3__[p3 + c3]);
score += this.ts_(this.BQ1__[p2 + c2 + c3]);
score += this.ts_(this.BQ2__[p2 + c3 + c4]);
score += this.ts_(this.BQ3__[p3 + c2 + c3]);
score += this.ts_(this.BQ4__[p3 + c3 + c4]);
score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]);
score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]);
score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]);
score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]);
var p = "O";
if (score > 0) {
result.push(word);
word = "";
p = "B";
}
p1 = p2;
p2 = p3;
p3 = p;
word += seg[i];
}
result.push(word);
return result;
}
lunr.TinySegmenter = TinySegmenter;
};
}));
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1 @@
{"version":3,"sources":["src/templates/assets/stylesheets/palette/_scheme.scss","../../../../src/templates/assets/stylesheets/palette.scss","src/templates/assets/stylesheets/palette/_accent.scss","src/templates/assets/stylesheets/palette/_primary.scss","src/templates/assets/stylesheets/utilities/_break.scss"],"names":[],"mappings":"AA2BA,cAGE,6BAME,sDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CACA,mDAAA,CACA,6DAAA,CACA,+DAAA,CACA,gEAAA,CAGA,mDAAA,CACA,gDAAA,CACA,yDAAA,CACA,4DAAA,CAGA,0BAAA,CACA,mCAAA,CAGA,iCAAA,CACA,kCAAA,CACA,mCAAA,CACA,mCAAA,CACA,kCAAA,CACA,iCAAA,CACA,+CAAA,CACA,6DAAA,CACA,gEAAA,CACA,4DAAA,CACA,4DAAA,CACA,6DAAA,CAGA,6CAAA,CAGA,+CAAA,CAGA,uDAAA,CACA,6DAAA,CACA,2DAAA,CAGA,iCAAA,CAGA,yDAAA,CACA,iEAAA,CAGA,mDAAA,CACA,mDAAA,CAGA,qDAAA,CACA,uDAAA,CAGA,8DAAA,CAKA,8DAAA,CAKA,0DAAA,CAzEA,iBCiBF,CD6DE,kHAEE,YC3DJ,CDkFE,yDACE,4BChFJ,CD+EE,2DACE,4BC7EJ,CD4EE,gEACE,4BC1EJ,CDyEE,2DACE,4BCvEJ,CDsEE,yDACE,4BCpEJ,CDmEE,0DACE,4BCjEJ,CDgEE,gEACE,4BC9DJ,CD6DE,0DACE,4BC3DJ,CD0DE,2OACE,4BC/CJ,CDsDA,+FAGE,iCCpDF,CACF,CCjDE,2BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD6CN,CCvDE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDoDN,CC9DE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD2DN,CCrEE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDkEN,CC5EE,8BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDyEN,CCnFE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDgFN,CC1FE,kCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDuFN,CCjGE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD8FN,CCxGE,4BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDqGN,CC/GE,6BACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCD4GN,CCtHE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDmHN,CC7HE,4BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCD6HN,CCpIE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDoIN,CC3IE,6BACE,yBAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCD2IN,CClJE,8BACE,4BAAA,CACA,2CAAA,CAIE,8BAAA,CACA,qCDkJN,CCzJE,mCACE,4BAAA,CACA,2CAAA,CAOE,yBAAA,CACA,qCDsJN,CE3JE,4BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFwJN,CEnKE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFgKN,CE3KE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFwKN,CEnLE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFgLN,CE3LE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFwLN,CEnME,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFgMN,CE3ME,mCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFwMN,CEnNE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFgNN,CE3NE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFwNN,CEnOE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFgON,CE3OE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFwON,CEnPE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFmPN,CE3PE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCF2PN,CEnQE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCFmQN,CE3QE,+BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAIE,+BAAA,CACA,sCF2QN,CEnRE,oCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFgRN,CE3RE,8BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCFwRN,CEnSE,6BACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BF4RN,CE5SE,kCACE,6BAAA,CACA,oCAAA,CACA,mCAAA,CAOE,0BAAA,CACA,sCAAA,CAKA,4BFqSN,CEtRE,sEACE,4BFyRJ,CE1RE,+DACE,4BF6RJ,CE9RE,iEACE,4BFiSJ,CElSE,gEACE,4BFqSJ,CEtSE,iEACE,4BFySJ,CEhSA,8BACE,mDAAA,CACA,4DAAA,CACA,0DAAA,CACA,oDAAA,CACA,2DAAA,CAGA,4BFiSF,CE9RE,yCACE,+BFgSJ,CE7RI,kDAEE,0CAAA,CACA,sCAAA,CAFA,mCFiSN,CG7MI,mCD1EA,+CACE,8CF0RJ,CEvRI,qDACE,8CFyRN,CEpRE,iEACE,mCFsRJ,CACF,CGxNI,sCDvDA,uCACE,oCFkRJ,CACF,CEzQA,8BACE,kDAAA,CACA,4DAAA,CACA,wDAAA,CACA,oDAAA,CACA,6DAAA,CAGA,4BF0QF,CEvQE,yCACE,+BFyQJ,CEtQI,kDAEE,0CAAA,CACA,sCAAA,CAFA,mCF0QN,CEnQE,yCACE,6CFqQJ,CG9NI,0CDhCA,8CACE,gDFiQJ,CACF,CGnOI,0CDvBA,iFACE,6CF6PJ,CACF,CG3PI,sCDKA,uCACE,6CFyPJ,CACF","file":"palette.css"}
+35
View File
@@ -0,0 +1,35 @@
/* docs/css/extra.css */
/* Styles principaux pour le conteneur et le texte */
.ps-root {
font-family: "Courier New", monospace;
font-size: 0.9em;
line-height: 1.5;
}
/* Styles pour les mots-clés */
.ps-keyword {
font-weight: bold;
color: #d73a49; /* Une belle teinte de rouge */
}
/* --- CORRECTION DE L'INDENTATION --- */
/* Cible tous les niveaux d'indentation et applique une marge gauche */
[class*="ps-indent-"] {
display: inline-block;
}
.ps-indent-1 {
margin-left: 2em;
}
.ps-indent-2 {
margin-left: 4em;
}
.ps-indent-3 {
margin-left: 6em;
}
.ps-indent-4 {
margin-left: 8em;
}
.ps-indent-5 {
margin-left: 10em;
}
+230
View File
@@ -0,0 +1,230 @@
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only" default-locale="en-US">
<info>
<title>Ecology Letters</title>
<id>http://www.zotero.org/styles/ecology-letters</id>
<link href="http://www.zotero.org/styles/ecology-letters" rel="self"/>
<link href="http://www.zotero.org/styles/apa" rel="template"/>
<link href="http://onlinelibrary.wiley.com/journal/10.1111/%28ISSN%291461-0248/homepage/ForAuthors.html" rel="documentation"/>
<author>
<name>David Kaplan</name>
<email>david.kaplan@ird.fr</email>
</author>
<contributor>
<name>Sebastian Karcher</name>
</contributor>
<category citation-format="author-date"/>
<category field="biology"/>
<issn>1461-023X</issn>
<eissn>1461-0248</eissn>
<updated>2023-10-11T10:45:32+00:00</updated>
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
</info>
<macro name="container">
<choose>
<if type="chapter paper-conference" match="any">
<text term="in" text-case="capitalize-first" suffix=": "/>
<text variable="container-title" font-style="italic"/>
<text variable="collection-title" prefix=", "/>
<names variable="editor translator" prefix=" (" delimiter=", " suffix=")">
<label form="short" suffix=" "/>
<name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with="." delimiter=", " delimiter-precedes-last="never"/>
</names>
</if>
<else>
<group delimiter=", ">
<text variable="container-title" font-style="italic" form="short"/>
<text variable="collection-title"/>
</group>
</else>
</choose>
</macro>
<macro name="author">
<names variable="author">
<name name-as-sort-order="all" and="symbol" sort-separator=", " initialize-with="." delimiter=", " delimiter-precedes-last="never"/>
<label form="short" prefix=" (" suffix=")" text-case="capitalize-first"/>
<et-al font-style="italic"/>
<substitute>
<names variable="editor"/>
<names variable="translator"/>
<text macro="title"/>
</substitute>
</names>
</macro>
<macro name="author-short">
<names variable="author">
<name form="short" and="symbol" delimiter=", " initialize-with=". "/>
<et-al font-style="italic"/>
<substitute>
<names variable="editor"/>
<names variable="translator"/>
<choose>
<if type="bill book graphic legal_case legislation motion_picture report song" match="any">
<text variable="title" form="short" font-style="italic"/>
</if>
<else>
<text variable="title" form="short" quotes="true"/>
</else>
</choose>
</substitute>
</names>
</macro>
<macro name="access">
<choose>
<if type="webpage">
<group>
<text term="available at" text-case="capitalize-first" suffix=": "/>
<text variable="URL" suffix="."/>
</group>
<text value="Last accessed" prefix=" " suffix=" "/>
<date variable="accessed">
<date-part name="day" suffix=" "/>
<date-part name="month" suffix=" "/>
<date-part name="year"/>
</date>
</if>
</choose>
</macro>
<macro name="title">
<choose>
<if type="report" match="any">
<text variable="title" font-style="italic"/>
<group prefix=" (" suffix=")">
<text variable="genre"/>
<text variable="number" prefix=" No. "/>
</group>
</if>
<else-if type="bill book graphic legal_case legislation motion_picture report song speech" match="any">
<text variable="title" font-style="italic"/>
</else-if>
<else-if type="webpage">
<text variable="title" font-style="italic"/>
</else-if>
<else>
<text variable="title"/>
</else>
</choose>
</macro>
<macro name="publisher">
<choose>
<if type="report" match="any">
<group delimiter=", ">
<text variable="publisher"/>
<text variable="publisher-place"/>
</group>
</if>
<else>
<text variable="genre" suffix=". "/>
<group delimiter=", ">
<text variable="publisher"/>
<text variable="publisher-place"/>
</group>
</else>
</choose>
</macro>
<macro name="event">
<choose>
<if variable="event">
<text term="presented at" text-case="capitalize-first" suffix=" "/>
<text variable="event"/>
</if>
</choose>
</macro>
<macro name="issued">
<choose>
<if variable="issued">
<date variable="issued">
<date-part name="year"/>
</date>
</if>
<else-if variable="accessed">
<choose>
<if type="webpage">
<date variable="accessed">
<date-part name="year"/>
</date>
</if>
<else>
<text term="no date" form="short"/>
</else>
</choose>
</else-if>
<else>
<text term="no date" form="short"/>
</else>
</choose>
</macro>
<macro name="edition">
<choose>
<if is-numeric="edition">
<group delimiter=" ">
<number variable="edition" form="ordinal"/>
<text value="edn"/>
</group>
</if>
<else>
<text variable="edition" suffix="."/>
</else>
</choose>
</macro>
<macro name="locators">
<choose>
<if type="article-journal article-magazine article-newspaper" match="any">
<group prefix=", " delimiter=", ">
<group>
<text variable="volume"/>
</group>
<text variable="page"/>
</group>
</if>
<else-if type="bill book graphic legal_case legislation motion_picture report song thesis" match="any">
<group delimiter=". " prefix=". ">
<text macro="edition"/>
<text macro="event"/>
<text macro="publisher"/>
</group>
</else-if>
<else-if type="chapter paper-conference" match="any">
<group delimiter=", " prefix=". ">
<text macro="event"/>
<text macro="publisher"/>
<group>
<label variable="page" form="short" suffix=" "/>
<text variable="page"/>
</group>
</group>
</else-if>
</choose>
</macro>
<citation et-al-min="3" et-al-use-first="1" disambiguate-add-year-suffix="true" collapse="year-suffix" year-suffix-delimiter=", ">
<sort>
<key macro="author"/>
<key macro="issued"/>
</sort>
<layout prefix="(" suffix=")" delimiter="; ">
<group delimiter=" ">
<text macro="author-short"/>
<text macro="issued"/>
</group>
</layout>
</citation>
<bibliography et-al-min="7" et-al-use-first="6" entry-spacing="0" hanging-indent="true">
<sort>
<key macro="author"/>
<key macro="issued" sort="ascending"/>
<key macro="title"/>
</sort>
<layout>
<group suffix=".">
<text macro="author" suffix="."/>
<text macro="issued" prefix=" (" suffix="). "/>
<group delimiter=". ">
<text macro="title"/>
<text macro="container"/>
</group>
<text macro="locators"/>
<text macro="access" prefix=". "/>
</group>
</layout>
</bibliography>
</style>
File diff suppressed because it is too large Load Diff
+963
View File
@@ -0,0 +1,963 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../superkmer/">
<link rel="next" href="../chunkreader/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Kmer - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#kmer-implementation" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Kmer
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../theory/kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Kmer
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Kmer
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#memory-layout" class="md-nav__link">
<span class="md-ellipsis">
Memory layout
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#encoding" class="md-nav__link">
<span class="md-ellipsis">
Encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#decoding" class="md-nav__link">
<span class="md-ellipsis">
Decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reverse-complement" class="md-nav__link">
<span class="md-ellipsis">
Reverse complement
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#canonical-form" class="md-nav__link">
<span class="md-ellipsis">
Canonical form
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#memory-layout" class="md-nav__link">
<span class="md-ellipsis">
Memory layout
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#encoding" class="md-nav__link">
<span class="md-ellipsis">
Encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#decoding" class="md-nav__link">
<span class="md-ellipsis">
Decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reverse-complement" class="md-nav__link">
<span class="md-ellipsis">
Reverse complement
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#canonical-form" class="md-nav__link">
<span class="md-ellipsis">
Canonical form
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="kmer-implementation">Kmer — implementation</h1>
<h2 id="memory-layout">Memory layout</h2>
<p><code>Kmer</code> is a <code>#[repr(transparent)]</code> newtype over <code>u64</code>:</p>
<div class="highlight"><pre><span></span><code><span class="cp">#[repr(transparent)]</span>
<span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">Kmer</span><span class="p">(</span><span class="kt">u64</span><span class="p">);</span>
</code></pre></div>
<p>Nucleotides are packed 2 bits each, <strong>left-aligned</strong>, MSB-first. Nucleotide 0 occupies bits 6362; nucleotide i occupies bits 632i and 622i. The low 642k bits are always zero. k is <strong>not stored</strong> — it is a parameter of every operation that needs it, and will be owned by the future collection-level indexer.</p>
<table>
<thead>
<tr>
<th>6362</th>
<th>6160</th>
<th></th>
<th>632(k1)1 to 632(k1)</th>
<th>632k down to 0</th>
</tr>
</thead>
<tbody>
<tr>
<td>nt 0</td>
<td>nt 1</td>
<td></td>
<td>nt k1</td>
<td>zero padding</td>
</tr>
</tbody>
</table>
<h2 id="encoding">Encoding</h2>
<p><code>Kmer::from_ascii(ascii, k)</code> encodes the first k bytes of an ASCII slice using the shared <code>ENC</code> table (see <a href="../superkmer/#ascii-encoding-and-decoding">SuperKmer — ASCII encoding</a>):</p>
<div class="highlight"><pre><span></span><code><span class="k">for</span><span class="w"> </span><span class="n">i</span><span class="w"> </span><span class="k">in</span><span class="w"> </span><span class="mi">0</span><span class="o">..</span><span class="n">k</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">val</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">encode_base</span><span class="p">(</span><span class="n">ascii</span><span class="p">[</span><span class="n">i</span><span class="p">])</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">u64</span><span class="p">;</span>
<span class="p">}</span>
<span class="n">Kmer</span><span class="p">(</span><span class="n">val</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">))</span>
</code></pre></div>
<p>Zero allocation — result lives on the stack.</p>
<h2 id="decoding">Decoding</h2>
<p><code>write_ascii(k, buf)</code> appends k ASCII characters to a caller-supplied <code>Vec&lt;u8&gt;</code> using the shared <code>DEC4</code> table: one lookup per 4 nucleotides, two partial-byte lookups for the remainder. No allocation in the hot path.</p>
<p><code>to_ascii(k)</code> is a convenience wrapper that allocates and returns a <code>Vec&lt;u8&gt;</code>; intended for tests and display only.</p>
<h2 id="reverse-complement">Reverse complement</h2>
<p>Computed as pure arithmetic — no lookup table, no memory access:</p>
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">!</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="p">;</span><span class="w"> </span><span class="c1">// complement</span>
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">x</span><span class="p">.</span><span class="n">swap_bytes</span><span class="p">();</span><span class="w"> </span><span class="c1">// reverse bytes</span>
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">4</span><span class="p">)</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x0F0F0F0F0F0F0F0F</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap nibbles</span>
<span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x3333333333333333</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap 2-bit groups</span>
<span class="n">Kmer</span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="p">(</span><span class="mi">64</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="n">k</span><span class="p">))</span>
</code></pre></div>
<p>After complementing, bytes are reversed (<code>swap_bytes</code>), then nibbles, then 2-bit groups — restoring 2-bit nucleotides to their correct positions in reverse order. A final left-shift realigns to MSB. Zero allocation — result lives on the stack.</p>
<h2 id="canonical-form">Canonical form</h2>
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">canonical</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">Self</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">rc</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">revcomp</span><span class="p">(</span><span class="n">k</span><span class="p">);</span>
<span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&lt;=</span><span class="w"> </span><span class="n">rc</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="o">*</span><span class="bp">self</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">rc</span><span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</code></pre></div>
<p>Lexicographic minimum of forward and reverse-complement, comparing the raw <code>u64</code> values directly (left-aligned encoding makes this equivalent to nucleotide-wise comparison). Zero allocation — result lives on the stack.</p>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+947
View File
@@ -0,0 +1,947 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../storage/">
<link rel="next" href="../../architecture/sequences/invariant/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>MPHF selection - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#mphf-selection-analysis-in-progress" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
MPHF selection
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../theory/kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
MPHF selection
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#candidates" class="md-nav__link">
<span class="md-ellipsis">
Candidates
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#space-at-scale" class="md-nav__link">
<span class="md-ellipsis">
Space at scale
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#on-disk-and-mmap-considerations" class="md-nav__link">
<span class="md-ellipsis">
On-disk and mmap considerations
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#open-questions" class="md-nav__link">
<span class="md-ellipsis">
Open questions
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#candidates" class="md-nav__link">
<span class="md-ellipsis">
Candidates
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#space-at-scale" class="md-nav__link">
<span class="md-ellipsis">
Space at scale
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#on-disk-and-mmap-considerations" class="md-nav__link">
<span class="md-ellipsis">
On-disk and mmap considerations
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#open-questions" class="md-nav__link">
<span class="md-ellipsis">
Open questions
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="mphf-selection-analysis-in-progress">MPHF selection — analysis in progress</h1>
<p>The choice of Minimal Perfect Hash Function for phase 6 is not yet settled. Three candidates were evaluated.</p>
<h2 id="candidates">Candidates</h2>
<p><strong>boomphf</strong> (BBHash algorithm, maintained by 10X Genomics):</p>
<ul>
<li>~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem)</li>
<li>Parallel construction; well-tested with DNA kmer data at scale</li>
<li>Drawback: largest space footprint of the three</li>
</ul>
<p><strong>ptr_hash</strong> (PtrHash algorithm, Groot Koerkamp, SEA 2025):</p>
<ul>
<li>~2.4 bits/key; fastest queries (≥2.1× over alternatives, 812 ns/key for u64 in tight loops) and fastest construction (≥3.1×)</li>
<li>Theoretical foundation solid; paper and Rust crate from the same author</li>
<li>Drawback: published February 2025 — very young, no production track record</li>
</ul>
<p><strong>FMPHGO</strong> (<code>ph</code> crate, Beling, ACM JEA 2023):</p>
<ul>
<li>~2.1 bits/key — most compact of the three; good query speed; parallelisable construction</li>
<li>More established than ptr_hash; actively maintained</li>
<li>Currently preferred candidate</li>
</ul>
<h2 id="space-at-scale">Space at scale</h2>
<p>For 1 024 partitions × 100 M kmers/partition:</p>
<table>
<thead>
<tr>
<th>MPHF</th>
<th>bits/key</th>
<th>Total MPHF size</th>
</tr>
</thead>
<tbody>
<tr>
<td>boomphf</td>
<td>3.7</td>
<td>~47 GB</td>
</tr>
<tr>
<td>ptr_hash</td>
<td>2.4</td>
<td>~31 GB</td>
</tr>
<tr>
<td>FMPHGO</td>
<td>2.1</td>
<td>~27 GB</td>
</tr>
</tbody>
</table>
<p>In practice, partition sizes depend on the dataset. For a human genome at 30× coverage with p=10 (1 024 partitions), realistic partition sizes are 330 M kmers → 18 MB per MPHF, well within RAM.</p>
<h2 id="on-disk-and-mmap-considerations">On-disk and mmap considerations</h2>
<p>All three are in-memory structures. Their internal representation is flat bit arrays (no heap pointers), making them serialisable as contiguous byte blobs and mmappable per partition. True zero-copy access would require rkyv integration; the <code>ph</code> crate currently uses serde, so loading involves a copy. Given per-partition MPHF sizes of 18 MB, the OS page cache handles this transparently — strict zero-copy is a refinement, not a blocker.</p>
<p>No established Rust crate provides a natively on-disk MPHF. <strong>SSHash</strong> (Sparse and Skew Hash) is a complete kmer dictionary designed for disk access and is order-preserving (overlapping kmers receive consecutive indices → cache-friendly count access), but it is C++-only and covers more than just the MPHF layer.</p>
<h2 id="open-questions">Open questions</h2>
<ul>
<li>Confirm actual partition sizes on representative metagenomic datasets before fixing the choice.</li>
<li>Evaluate whether ptr_hash's query speed advantage (2.13.3×) justifies adopting a crate that is less than a year old.</li>
<li>Assess rkyv integration cost for FMPHGO if true zero-copy mmap becomes necessary.</li>
<li>Keep SSHash in mind if the indexing architecture is reconsidered at a higher level.</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
File diff suppressed because it is too large Load Diff
+946
View File
@@ -0,0 +1,946 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../pipeline/">
<link rel="next" href="../mphf/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>On-disk storage - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#on-disk-collection-structure" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
On-disk storage
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../theory/kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
On-disk storage
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#collection-parameters" class="md-nav__link">
<span class="md-ellipsis">
Collection parameters
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#count-storage" class="md-nav__link">
<span class="md-ellipsis">
Count storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#query-protocol" class="md-nav__link">
<span class="md-ellipsis">
Query protocol
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#collection-parameters" class="md-nav__link">
<span class="md-ellipsis">
Collection parameters
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#count-storage" class="md-nav__link">
<span class="md-ellipsis">
Count storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#query-protocol" class="md-nav__link">
<span class="md-ellipsis">
Query protocol
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="on-disk-collection-structure">On-disk collection structure</h1>
<p>Collections are too large to hold in RAM (hundreds of genomes, billions of kmers). The collection lives on disk as a directory of memory-mapped files:</p>
<div class="highlight"><pre><span></span><code>collection/
metadata.toml — collection parameters (see below)
part_XXXX/
superkmers.bin.gz — dereplicated super-kmers for this partition (construction artifact)
mphf.bin — minimal perfect hash function for this partition
counts.bin — packed n-bit count array (or 1-bit presence array)
refs.bin — back-references u32 nucleotide offset into unitigs.bin per kmer
unitigs.bin — local de Bruijn unitigs (permanent evidence structure)
overflow.bin — counts exceeding the packed range (optional)
</code></pre></div>
<p><code>superkmers.bin.gz</code> is produced during phase 1 and consumed through phases 24. It can be deleted after phase 5 — it is not needed for querying. The permanent query structure is <code>mphf.bin + counts.bin + refs.bin + unitigs.bin</code>.</p>
<h2 id="collection-parameters">Collection parameters</h2>
<p>Stored in <code>metadata.toml</code>:</p>
<table>
<thead>
<tr>
<th>Parameter</th>
<th>Role</th>
</tr>
</thead>
<tbody>
<tr>
<td>k</td>
<td>kmer length</td>
</tr>
<tr>
<td>m</td>
<td>minimizer length (odd, &lt; k)</td>
</tr>
<tr>
<td>p</td>
<td>partition bits (0 ≤ p ≤ min(14, 2m16))</td>
</tr>
<tr>
<td>mode</td>
<td><code>presence</code> (1 bit/kmer) or <code>count</code> (n bits/kmer)</td>
</tr>
<tr>
<td>n</td>
<td>bits per kmer in count mode (chosen at construction)</td>
</tr>
<tr>
<td>min_count</td>
<td>singleton filtering threshold (0 = keep all)</td>
</tr>
<tr>
<td>hash_fn</td>
<td>hash function identifier</td>
</tr>
<tr>
<td>hash_seed</td>
<td>seed for the hash function</td>
</tr>
</tbody>
</table>
<h2 id="count-storage">Count storage</h2>
<p><strong>refs.bin capacity:</strong> <code>unitigs.bin</code> is a flat 2-bit-packed nucleotide stream with no separators. Each entry in <code>refs.bin</code> is a u32 nucleotide offset pointing to the first base of the kmer. A u32 covers 4 billion nucleotide positions = 1 GB of sequence per partition. In the worst case (all unitigs of length 1 kmer, offsets spaced k apart), this supports 4 billion / k ≈ 130 million kmers per partition at k=31. In the typical case (long unitigs, consecutive kmers at offset +1), the limit approaches 4 billion kmers — well beyond any realistic partition size.</p>
<p><em>Presence mode</em> (coverage ≤ 1x, or when only presence/absence matters):</p>
<ul>
<li><code>counts.bin</code> is a packed 1-bit array — all bits set to 1 for indexed kmers</li>
<li>Singletons are the signal, not filtered</li>
</ul>
<p><em>Count mode</em> (coverage &gt; 1x):</p>
<ul>
<li><code>counts.bin</code> is a packed n-bit array; n chosen at construction from the observed distribution</li>
<li>Value 0: absent sentinel; values 1..2ⁿ−2: direct count; value 2ⁿ−1: overflow</li>
<li>Overflow counts stored in a separate <code>overflow.bin</code> as sorted <code>(index: u32, count: u32)</code> pairs</li>
<li>Empirically (k=31, 15x coverage): n=5 covers 97% of real kmers, n=6 covers 99%</li>
<li>min_count threshold filters low-frequency kmers (errors) before indexing; for ≤1x, min_count=0</li>
</ul>
<h2 id="query-protocol">Query protocol</h2>
<div class="highlight"><pre><span></span><code>query kmer q
→ canonical_minimizer(q) → hash → PART → part_XXXX/
→ MPHF(q) → index i
→ refs[i] = (unitig_id, kmer_offset)
→ read unitig from unitigs.bin → extract kmer at kmer_offset → compare with q
→ match : return counts[i]
→ no match: kmer absent
</code></pre></div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+986
View File
@@ -0,0 +1,986 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../../theory/indexing/">
<link rel="next" href="../kmer/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>SuperKmer - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#superkmer-implementation" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
SuperKmer
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../theory/kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
SuperKmer
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#memory-layout" class="md-nav__link">
<span class="md-ellipsis">
Memory layout
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ascii-encoding-and-decoding" class="md-nav__link">
<span class="md-ellipsis">
ASCII encoding and decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reverse-complement" class="md-nav__link">
<span class="md-ellipsis">
Reverse complement
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#kmer-extraction" class="md-nav__link">
<span class="md-ellipsis">
Kmer extraction
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#memory-layout" class="md-nav__link">
<span class="md-ellipsis">
Memory layout
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#ascii-encoding-and-decoding" class="md-nav__link">
<span class="md-ellipsis">
ASCII encoding and decoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reverse-complement" class="md-nav__link">
<span class="md-ellipsis">
Reverse complement
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#kmer-extraction" class="md-nav__link">
<span class="md-ellipsis">
Kmer extraction
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="superkmer-implementation">SuperKmer — implementation</h1>
<h2 id="memory-layout">Memory layout</h2>
<p>A super-kmer is stored as a <strong>32-bit header</strong> followed by a <strong>byte-aligned nucleotide sequence</strong> (2 bits/base, nucleotide 0 at the MSB of the first byte, max 256 nt):</p>
<table>
<thead>
<tr>
<th>Field</th>
<th>Bits</th>
<th>Role</th>
</tr>
</thead>
<tbody>
<tr>
<td>COUNT</td>
<td>24</td>
<td>Occurrence count (≤ 16 M)</td>
</tr>
<tr>
<td>SEQL</td>
<td>8</td>
<td>Sequence length in nucleotides (1256)</td>
</tr>
</tbody>
</table>
<p>Bit layout (MSB to LSB): <code>[31:8] COUNT [7:0] SEQL</code></p>
<p>SEQL is stored as a raw <code>u8</code>: values 1255 represent lengths 1255; <strong>0 represents 256</strong> (wrapping convention). The public accessor returns a <code>usize</code> and performs the conversion:</p>
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">seql</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="n">s</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="mi">0</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="mi">256</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">s</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">count</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u32</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">increment</span><span class="p">(</span><span class="o">&amp;</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="mi">1</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">add</span><span class="p">(</span><span class="o">&amp;</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">+=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">8</span><span class="p">;</span><span class="w"> </span><span class="p">}</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">set_count</span><span class="p">(</span><span class="o">&amp;</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="bp">self</span><span class="p">.</span><span class="mi">0</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0xFF</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">n</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">8</span><span class="p">);</span><span class="w"> </span><span class="p">}</span>
</code></pre></div>
<p>The SEQL field is 8 bits, capping the stored sequence at 256 nt. Given the expected length of ~40 nt, this cap is almost never reached; when it is, the super-kmer is split at 256 nt with a k1 overlap, preserving all kmers without duplication.</p>
<p>The sequence is always stored in canonical form (lexicographic minimum of forward and reverse complement), with nucleotide 0 at the MSB of the first byte. The byte array can be hashed directly without any adjustment.</p>
<h2 id="ascii-encoding-and-decoding">ASCII encoding and decoding</h2>
<p>Two lookup tables handle ASCII ↔ 2-bit conversion:</p>
<ul>
<li><strong><code>ENC: [u8; 32]</code></strong> — indexed by <code>b &amp; 0x1F</code> (lower 5 bits of the ASCII byte). Maps A/a→0, C/c→1, G/g→2, T/t and U/u→3; ambiguous bases and unknowns silently map to 0 (A). 32 entries, fits entirely in L1 cache. Upper- and lowercase are handled identically.</li>
<li><strong><code>DEC4: [u32; 256]</code></strong> — maps a packed byte (4 nucleotides) to 4 ASCII characters packed as a big-endian <code>u32</code>. 1 KB total, fits in L1 cache. One lookup per output byte yields 4 decoded characters.</li>
</ul>
<p>Encoding 4 nucleotides into one byte:</p>
<div class="highlight"><pre><span></span><code><span class="n">byte</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c0</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">6</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c1</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">4</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c2</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">ENC</span><span class="p">[</span><span class="n">c3</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x1F</span><span class="p">]</span>
</code></pre></div>
<p>Decoding one byte into 4 ASCII characters:</p>
<div class="highlight"><pre><span></span><code><span class="n">DEC4</span><span class="p">[</span><span class="n">byte</span><span class="p">].</span><span class="n">to_be_bytes</span><span class="p">()</span><span class="w"> </span><span class="c1">// [nuc0, nuc1, nuc2, nuc3] in ASCII</span>
</code></pre></div>
<h2 id="reverse-complement">Reverse complement</h2>
<p>The reverse complement is computed <strong>in place</strong> with zero allocation in two steps.</p>
<p><strong>Step 1 — byte swap with <code>REVCOMP4</code>.</strong> A 256-byte lookup table <code>REVCOMP4</code> maps each byte (4 nucleotides) to its reverse complement. Bytes are swapped from the outside in, applying <code>REVCOMP4</code> to each:</p>
<div class="highlight"><pre><span></span><code><span class="k">const</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">revcomp4</span><span class="p">(</span><span class="n">x</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u8</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="o">!</span><span class="n">x</span><span class="p">;</span><span class="w"> </span><span class="c1">// complement all bases</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">4</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">(</span><span class="n">x</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">4</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap nibbles</span>
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">x</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">2</span><span class="p">)</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x33</span><span class="p">)</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="p">((</span><span class="n">x</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mh">0x33</span><span class="p">)</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="mi">2</span><span class="p">);</span><span class="w"> </span><span class="c1">// swap 2-bit groups</span>
<span class="w"> </span><span class="n">x</span>
<span class="p">}</span>
</code></pre></div>
<p><code>REVCOMP4</code> is 256 bytes (fits in L1 cache), computed at compile time. No endianness dependency — all operations are pure arithmetic on byte values.</p>
<p><strong>Step 2 — realignment.</strong> After step 1, <code>padding = n × 8 SEQL × 2</code> spurious bits (complements of the original padding A's) appear at the start of the array. They are flushed left using <code>BitSlice&lt;u8, Msb0&gt;::rotate_left(padding)</code> from the <code>bitvec</code> crate, which is SIMD-accelerated. The trailing <code>padding</code> bits are then zeroed:</p>
<div class="highlight"><pre><span></span><code><span class="n">shift</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">n</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">8</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">SEQL</span><span class="w"> </span><span class="o">*</span><span class="w"> </span><span class="mi">2</span><span class="w"> </span><span class="c1">// number of padding bits</span>
<span class="n">bits</span><span class="p">.</span><span class="n">rotate_left</span><span class="p">(</span><span class="n">shift</span><span class="p">)</span>
<span class="n">bits</span><span class="p">[</span><span class="n">len</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">shift</span><span class="o">..</span><span class="p">].</span><span class="n">fill</span><span class="p">(</span><span class="kc">false</span><span class="p">)</span>
</code></pre></div>
<p><code>Msb0</code> ordering makes the bit layout hardware-independent.</p>
<div class="admonition abstract">
<p class="admonition-title">Algorithm — Super-kmer canonisation</p>
<div class="highlight"><pre><span></span><code>procedure SuperKmerCanonical(seq, SEQL):
for i ← 0 to SEQL 1:
fwd ← nucleotide(seq, i)
rev ← complement(nucleotide(seq, SEQL 1 i))
if fwd &lt; rev: return seq -- forward is canonical
if fwd &gt; rev: return SuperKmerRevcomp(seq, SEQL) -- revcomp is canonical
return seq -- palindrome: either orientation valid
</code></pre></div>
</div>
<h2 id="kmer-extraction">Kmer extraction</h2>
<p>A k-mer is extracted from a super-kmer with <code>SuperKmer::kmer(i, k)</code>, which returns a <code>Kmer</code> — a left-aligned <code>u64</code> newtype (see <a href="../kmer/">Kmer implementation</a>):</p>
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">kmer</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">i</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">k</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nb">Result</span><span class="o">&lt;</span><span class="n">Kmer</span><span class="p">,</span><span class="w"> </span><span class="n">KmerError</span><span class="o">&gt;</span>
</code></pre></div>
<p>The bit slice <code>seq[i*2 .. (i+k)*2]</code> (Msb0 order) is loaded as a big-endian <code>u64</code> via <code>bitvec::load_be</code>, then left-shifted to produce the canonical left-aligned layout. One call — no loop, no allocation.</p>
<hr />
<div class="admonition abstract">
<p class="admonition-title">Algorithm — Super-kmer reverse complement</p>
<div class="highlight"><pre><span></span><code>procedure SuperKmerRevcomp(seq, SEQL):
n ← ⌈SEQL / 4⌉ -- number of bytes
shift ← n × 8 SEQL × 2 -- padding bits to flush
-- step 1: swap bytes outside-in, applying REVCOMP4 to each (256-byte L1 table)
lo ← 0 ; hi ← n 1
while lo &lt; hi:
seq[lo], seq[hi] ← REVCOMP4[seq[hi]], REVCOMP4[seq[lo]]
lo ← lo + 1 ; hi ← hi 1
if lo == hi: seq[lo] ← REVCOMP4[seq[lo]]
-- step 2: left-rotate entire bit array by shift, zero trailing bits (SIMD via bitvec)
if shift &gt; 0:
bits.rotate_left(shift)
bits[n×8 shift .. n×8].fill(0)
</code></pre></div>
</div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+853
View File
@@ -0,0 +1,853 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="next" href="theory/kmers/">
<link rel="icon" href="assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>obikmer</title>
<link rel="stylesheet" href="assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL(".",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#obikmer" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Home
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Home
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="." class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Home
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#constraints" class="md-nav__link">
<span class="md-ellipsis">
Constraints
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#priority-operations" class="md-nav__link">
<span class="md-ellipsis">
Priority operations
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="theory/kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#constraints" class="md-nav__link">
<span class="md-ellipsis">
Constraints
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#priority-operations" class="md-nav__link">
<span class="md-ellipsis">
Priority operations
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="obikmer">obikmer</h1>
<p><code>obikmer</code> is a Rust tool for manipulation, counting, indexing, and set operations on DNA sequences represented as kmer sets.</p>
<h2 id="constraints">Constraints</h2>
<ul>
<li>Target scale: metagenomic data, tens of Gbases, billions of kmers</li>
<li>Maximum efficiency in computation, memory, and disk usage</li>
<li>Input formats: FASTA, FASTQ, gzip, streaming stdin</li>
</ul>
<h2 id="priority-operations">Priority operations</h2>
<ul>
<li>Kmer counting (frequencies)</li>
<li>Fast search / query</li>
<li>Set operations: union, intersection, difference</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": ".", "features": [], "search": "assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+243
View File
@@ -0,0 +1,243 @@
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200
%% Saved with string encoding Unicode (UTF-8)
@article{Zheng2020-ji,
abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
string, with the guarantee that similar set of k-mers will be
chosen on similar strings. It is parameterized by the k-mer
length k, a window length w and an order on the k-mers.
Minimizers are used in a large number of softwares and pipelines
to improve computation efficiency and decrease memory usage.
Despite the method's popularity, many theoretical questions
regarding its performance remain open. The core metric for
measuring performance of a minimizer is the density, which
measures the sparsity of sampled k-mers. The theoretical optimal
density for a minimizer is 1/w, provably not achievable in
general. For given k and w, little is known about asymptotically
optimal minimizers, that is minimizers with density O(1/w).
RESULTS: We derive a necessary and sufficient condition for
existence of asymptotically optimal minimizers. We also provide a
randomized algorithm, called the Miniception, to design
minimizers with the best theoretical guarantee to date on density
in practical scenarios. Constructing and using the Miniception is
as easy as constructing and using a random minimizer, which
allows the design of efficient minimizers that scale to the
values of k and w used in current bioinformatics software
programs. AVAILABILITY AND IMPLEMENTATION: Reference
implementation of the Miniception and the codes for analysis can
be found at https://github.com/kingsford-group/miniception.
SUPPLEMENTARY INFORMATION: Supplementary data are available at
Bioinformatics online.},
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
doi = {10.1093/bioinformatics/btaa472},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = jul,
number = {Suppl_1},
pages = {i119--i127},
pmc = {PMC8248892},
pmid = 32657376,
publisher = {Oxford University Press (OUP)},
title = {Improved design and analysis of practical minimizers},
url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
volume = 36,
year = 2020,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}
@article{Zheng2021-cc,
abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
from genomic sequences that unconditionally preserve sufficiently
long matches between sequences. Well-established methods to
construct efficient minimizers focus on sampling fewer k-mers on
a random sequence and use universal hitting sets (sets of k-mers
that appear frequently enough) to upper bound the sketch size. In
contrast, the problem of sequence-specific minimizers, which is
to construct efficient minimizers to sample fewer k-mers on a
specific sequence such as the reference genome, is less studied.
Currently, the theoretical understanding of this problem is
lacking, and existing methods do not specialize well to sketch
specific sequences. RESULTS: We propose the concept of polar
sets, complementary to the existing idea of universal hitting
sets. Polar sets are k-mer sets that are spread out enough on the
reference, and provably specialize well to specific sequences.
Link energy measures how well spread out a polar set is, and with
it, the sketch size can be bounded from above and below in a
theoretically sound way. This allows for direct optimization of
sketch size. We propose efficient heuristics to construct polar
sets, and via experiments on the human reference genome, show
their practical superiority in designing efficient
sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
reference implementation and code for analyses under an
open-source license are at
https://github.com/kingsford-group/polarset. SUPPLEMENTARY
INFORMATION: Supplementary data are available at Bioinformatics
online.},
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
doi = {10.1093/bioinformatics/btab313},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = jul,
number = {Suppl\_1},
pages = {i187--i195},
pmc = {PMC8686682},
pmid = 34252928,
publisher = {Oxford University Press (OUP)},
title = {Sequence-specific minimizers via polar sets},
url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
volume = 37,
year = 2021,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}
@article{Pan2024-hb,
abstract = {MOTIVATION: The minimizer concept is a data structure for
sequence sketching. The standard canonical minimizer selects a
subset of k-mers from the given DNA sequence by comparing the
forward and reverse k-mers in a window simultaneously according
to a predefined selection scheme. It is widely employed by
sequence analysis such as read mapping and assembly. k-mer
density, k-mer repetitiveness (e.g. k-mer bias), and
computational efficiency are three critical measurements for
minimizer selection schemes. However, there exist trade-offs
between kinds of minimizer variants. Generic, effective, and
efficient are always the requirements for high-performance
minimizer algorithms. RESULTS: We propose a simple minimizer
operator as a refinement of the standard canonical minimizer. It
takes only a few operations to compute. However, it can improve
the k-mer repetitiveness, especially for the lexicographic order.
It applies to other selection schemes of total orders (e.g.
random orders). Moreover, it is computationally efficient and the
density is close to that of the standard minimizer. The refined
minimizer may benefit high-performance applications like binning
and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
code of the benchmark in this work is available at the github
repository https://github.com/xp3i4/mini\_benchmark.},
author = {Pan, Chenxu and Reinert, Knut},
doi = {10.1093/bioinformatics/btae045},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = feb,
number = 2,
pmc = {PMC10868324},
pmid = 38269626,
publisher = {Oxford University Press (OUP)},
title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
volume = 40,
year = 2024,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}
@article{Kille2023-px,
abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
a convenient proxy for sequence identity. By avoiding expensive
base-level alignments and comparing reduced sequence
representations, tools such as MashMap can scale to massive
numbers of pairwise comparisons while still providing useful
similarity estimates. However, due to their reliance on minimizer
winnowing, previous versions of MashMap were shown to be biased
and inconsistent estimators of Jaccard similarity. This directly
impacts downstream tools that rely on the accuracy of these
estimates. RESULTS: To address this, we propose the minmer
winnowing scheme, which generalizes the minimizer scheme by use
of a rolling minhash with multiple sampled k-mers per window. We
show both theoretically and empirically that minmers yield an
unbiased estimator of local Jaccard similarity, and we implement
this scheme in an updated version of MashMap. The minmer-based
implementation is over 10 times faster than the minimizer-based
version under the default ANI threshold, making it well-suited
for large-scale comparative genomics applications. AVAILABILITY
AND IMPLEMENTATION: MashMap3 is available at
https://github.com/marbl/MashMap.},
author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
doi = {10.1093/bioinformatics/btad512},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = sep,
number = 9,
pmc = {PMC10505501},
pmid = 37603771,
publisher = {Oxford University Press (OUP)},
title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
volume = 39,
year = 2023,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}
@incollection{Golan2025-xf,
address = {Cham},
author = {Golan, Shay and Shur, Arseny M},
booktitle = {Lecture Notes in Computer Science},
doi = {10.1007/978-3-031-82670-2\_25},
isbn = {9783031826696,9783031826702},
issn = {0302-9743,1611-3349},
language = {en},
pages = {347--360},
publisher = {Springer Nature Switzerland},
series = {Lecture Notes in Computer Science},
title = {Expected density of random minimizers},
url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
year = 2025,
bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}
@article{Mohamadi2017-ok,
abstract = {Motivation: Many bioinformatics algorithms are designed for the
analysis of sequences of some uniform length, conventionally
referred to as k -mers. These include de Bruijn graph assembly
methods and sequence alignment tools. An efficient algorithm to
enumerate the number of unique k -mers, or even better, to build
a histogram of k -mer frequencies would be desirable for these
tools and their downstream analysis pipelines. Among other
applications, estimated frequencies can be used to predict genome
sizes, measure sequencing error rates, and tune runtime
parameters for analysis tools. However, calculating a k -mer
histogram from large volumes of sequencing data is a challenging
task. Results: Here, we present ntCard, a streaming algorithm for
estimating the frequencies of k -mers in genomics datasets. At
its core, ntCard uses the ntHash algorithm to efficiently compute
hash values for streamed sequences. It then samples the
calculated hash values to build a reduced representation
multiplicity table describing the sample distribution. Finally,
it uses a statistical model to reconstruct the population
distribution from the sample distribution. We have compared the
performance of ntCard and other cardinality estimation
algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
in size, where the first two representing whole genome shotgun
sequencing experiments on the human genome and the last one on
the white spruce genome. Results show ntCard estimates k -mer
coverage frequencies >15× faster than the state-of-the-art
algorithms, using similar amount of memory, and with higher
accuracy rates. Thus, our benchmarks demonstrate ntCard as a
potentially enabling technology for large-scale genomics
applications. Availability and Implementation: ntCard is written
in C ++ and is released under the GPL license. It is freely
available at https://github.com/bcgsc/ntCard. Contact:
hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
Supplementary data are available at Bioinformatics online.},
author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
date-modified = {2026-04-18 08:19:36 +0200},
doi = {10.1093/bioinformatics/btw832},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = may,
number = 9,
pages = {1324--1330},
pmc = {PMC5408799},
pmid = 28453674,
publisher = {Oxford University Press (OUP)},
title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
volume = 33,
year = 2017,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}
+3
View File
@@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
</urlset>
Binary file not shown.
+890
View File
@@ -0,0 +1,890 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../kmers/">
<link rel="next" href="../entropy/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>DNA encoding - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#dna-encoding" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
DNA encoding
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
DNA encoding
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#2-bit-nucleotide-encoding" class="md-nav__link">
<span class="md-ellipsis">
2-bit nucleotide encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#kmer-encoding" class="md-nav__link">
<span class="md-ellipsis">
Kmer encoding
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#2-bit-nucleotide-encoding" class="md-nav__link">
<span class="md-ellipsis">
2-bit nucleotide encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#kmer-encoding" class="md-nav__link">
<span class="md-ellipsis">
Kmer encoding
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="dna-encoding">DNA encoding</h1>
<h2 id="2-bit-nucleotide-encoding">2-bit nucleotide encoding</h2>
<p>All nucleotides are encoded on 2 bits, MSB-first within each word. Nucleotides are numbered 0-based from the 5 end across all sequence types:</p>
<table>
<thead>
<tr>
<th>Base</th>
<th>Encoding</th>
</tr>
</thead>
<tbody>
<tr>
<td>A</td>
<td><code>00</code></td>
</tr>
<tr>
<td>C</td>
<td><code>01</code></td>
</tr>
<tr>
<td>G</td>
<td><code>10</code></td>
</tr>
<tr>
<td>T</td>
<td><code>11</code></td>
</tr>
</tbody>
</table>
<p>The Watson-Crick complement of any base is its bitwise NOT on 2 bits: <code>complement(base) = ~base &amp; 0b11</code>.</p>
<h2 id="kmer-encoding">Kmer encoding</h2>
<p>A kmer fits in a single <code>u64</code>. Nucleotide 0 occupies bits 6362, nucleotide i occupies bits 632i and 622i, and the low 642k bits are zero. Extraction of nucleotide i (0 ≤ i &lt; k): <code>(kmer &gt;&gt; (62 - 2*i)) &amp; 0b11</code>.</p>
<p>Reverse complement is computed via a <strong>16-bit lookup table</strong> (65 536 entries × 2 bytes = 128 KB, fits in L2 cache) storing the reverse-complement of every 8-base chunk.</p>
<div class="admonition abstract">
<p class="admonition-title">Algorithm — Kmer reverse complement</p>
<div class="highlight"><pre><span></span><code>procedure KmerRevcomp(kmer, k):
raw ← TABLE16[kmer &amp; 0xFFFF] &lt;&lt; 48
| TABLE16[(kmer &gt;&gt; 16) &amp; 0xFFFF] &lt;&lt; 32
| TABLE16[(kmer &gt;&gt; 32) &amp; 0xFFFF] &lt;&lt; 16
| TABLE16[(kmer &gt;&gt; 48) &amp; 0xFFFF]
return raw &lt;&lt; (64 - 2*k)
</code></pre></div>
</div>
<p>The <strong>canonical form</strong> is the lexicographic minimum of the kmer and its reverse complement:</p>
<div class="highlight"><pre><span></span><code>canonical(kmer) = min(kmer, revcomp(kmer))
</code></pre></div>
<p>This halves the kmer space and ensures strand-independent counting.</p>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+990
View File
@@ -0,0 +1,990 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../encoding/">
<link rel="next" href="../indexing/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Entropy filter - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#kmer-entropy-filter" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Entropy filter
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Entropy filter
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#sub-word-frequencies" class="md-nav__link">
<span class="md-ellipsis">
Sub-word frequencies
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#corrected-shannon-entropy" class="md-nav__link">
<span class="md-ellipsis">
Corrected Shannon entropy
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#maximum-entropy-correction-for-small-samples" class="md-nav__link">
<span class="md-ellipsis">
Maximum entropy correction for small samples
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#normalized-entropy" class="md-nav__link">
<span class="md-ellipsis">
Normalized entropy
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#final-score" class="md-nav__link">
<span class="md-ellipsis">
Final score
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#interpretation-as-an-effective-number-of-classes" class="md-nav__link">
<span class="md-ellipsis">
Interpretation as an effective number of classes
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#properties" class="md-nav__link">
<span class="md-ellipsis">
Properties
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#sub-word-frequencies" class="md-nav__link">
<span class="md-ellipsis">
Sub-word frequencies
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#corrected-shannon-entropy" class="md-nav__link">
<span class="md-ellipsis">
Corrected Shannon entropy
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#maximum-entropy-correction-for-small-samples" class="md-nav__link">
<span class="md-ellipsis">
Maximum entropy correction for small samples
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#normalized-entropy" class="md-nav__link">
<span class="md-ellipsis">
Normalized entropy
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#final-score" class="md-nav__link">
<span class="md-ellipsis">
Final score
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#interpretation-as-an-effective-number-of-classes" class="md-nav__link">
<span class="md-ellipsis">
Interpretation as an effective number of classes
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#properties" class="md-nav__link">
<span class="md-ellipsis">
Properties
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="kmer-entropy-filter">Kmer entropy filter</h1>
<p>Low-complexity kmers (polyA, polyT, tandem repeats) are detected and excluded during phase 1. The filter computes a <strong>normalized Shannon entropy</strong> over sub-words of multiple sizes, corrected for two sources of bias: the small number of observations within a single kmer, and the unequal sizes of circular equivalence classes.</p>
<h2 id="sub-word-frequencies">Sub-word frequencies</h2>
<p>For a kmer of length k and a sub-word size ws (1 ≤ ws ≤ ws_max, typically ws_max = 6), extract the <span class="arithmatex">\(n_{\text{words}} = k - ws + 1\)</span> overlapping sub-words by sliding a window of length ws:</p>
<div class="arithmatex">\[w_i = \text{kmer}[i \mathinner{..} i+ws-1], \quad i = 0, \ldots, n_{\text{words}}-1\]</div>
<p>Each sub-word is mapped to its <strong>circular canonical form</strong>: the lexicographic minimum among all cyclic rotations of the word <strong>and all cyclic rotations of its reverse complement</strong>. This extended equivalence relation ensures that entropy(K) = entropy(revcomp(K)) — the filter is strand-symmetric. Let <span class="arithmatex">\(s_j\)</span> be the size of equivalence class <span class="arithmatex">\(j\)</span> (number of distinct raw words mapping to canonical form <span class="arithmatex">\(j\)</span>), and <span class="arithmatex">\(f_j\)</span> the count of canonical form <span class="arithmatex">\(j\)</span> among the <span class="arithmatex">\(n_{\text{words}}\)</span> sub-words (<span class="arithmatex">\(\sum_j f_j = n_{\text{words}}\)</span>).</p>
<h2 id="corrected-shannon-entropy">Corrected Shannon entropy</h2>
<p>The circular equivalence classes have unequal sizes: under a uniform distribution over all <span class="arithmatex">\(4^{ws}\)</span> raw words, class <span class="arithmatex">\(j\)</span> is visited with probability <span class="arithmatex">\(s_j / 4^{ws}\)</span>, not <span class="arithmatex">\(1/n_a\)</span>. Computing entropy directly over canonical classes therefore underestimates the entropy of a random sequence.</p>
<p>The correction "unfolds" each canonical class back to its member raw words, redistributing each observation of class <span class="arithmatex">\(j\)</span> equally among its <span class="arithmatex">\(s_j\)</span> members:</p>
<div class="arithmatex">\[H_{\text{corr}} = \log(n_{\text{words}}) - \frac{1}{n_{\text{words}}} \sum_j f_j \log f_j + \frac{1}{n_{\text{words}}} \sum_j f_j \log s_j\]</div>
<p>The last term is the correction for unequal class sizes. For a uniformly random sequence (<span class="arithmatex">\(f_j \approx n_{\text{words}} \cdot s_j / 4^{ws}\)</span>), this gives <span class="arithmatex">\(H_{\text{corr}} \approx \log(4^{ws}) = 2 \cdot ws \cdot \log 2\)</span>, the maximum entropy over raw words.</p>
<h2 id="maximum-entropy-correction-for-small-samples">Maximum entropy correction for small samples</h2>
<p>With only <span class="arithmatex">\(n_{\text{words}}\)</span> observations over <span class="arithmatex">\(4^{ws}\)</span> possible raw words, the achievable maximum entropy is bounded by the most uniform integer distribution over <span class="arithmatex">\(4^{ws}\)</span> categories.</p>
<p>Let <span class="arithmatex">\(c = \lfloor n_{\text{words}} / 4^{ws} \rfloor\)</span> and <span class="arithmatex">\(r = n_{\text{words}} \bmod 4^{ws}\)</span>. The most uniform integer distribution assigns frequency <span class="arithmatex">\(c+1\)</span> to <span class="arithmatex">\(r\)</span> categories and <span class="arithmatex">\(c\)</span> to the remaining <span class="arithmatex">\(4^{ws} - r\)</span>, with the convention <span class="arithmatex">\(0 \log 0 = 0\)</span>:</p>
<div class="arithmatex">\[H_{\max} = -\left[(4^{ws} - r)\,\frac{c}{n_{\text{words}}}\log\frac{c}{n_{\text{words}}} + r\,\frac{c+1}{n_{\text{words}}}\log\frac{c+1}{n_{\text{words}}}\right]\]</div>
<p>When <span class="arithmatex">\(n_{\text{words}} &lt; 4^{ws}\)</span>: <span class="arithmatex">\(c=0\)</span>, <span class="arithmatex">\(r=n_{\text{words}}\)</span>, and the formula reduces to <span class="arithmatex">\(H_{\max} = \log(n_{\text{words}})\)</span> — a single unified expression covers both regimes. A truly random sequence achieves <span class="arithmatex">\(H_{\text{corr}} \approx H_{\max}\)</span>.</p>
<h2 id="normalized-entropy">Normalized entropy</h2>
<div class="arithmatex">\[\hat{H}(ws) = \frac{H_{\text{corr}}}{H_{\max}} \in [0, 1]\]</div>
<h2 id="final-score">Final score</h2>
<p>The filter computes <span class="arithmatex">\(\hat{H}(ws)\)</span> for each word size ws from 1 to ws_max and returns the <strong>minimum</strong>:</p>
<div class="arithmatex">\[\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)\]</div>
<p>A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if <span class="arithmatex">\(\text{entropy}(kmer) \leq \theta\)</span>, where <span class="arithmatex">\(\theta\)</span> is a collection parameter. The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.</p>
<h2 id="interpretation-as-an-effective-number-of-classes">Interpretation as an effective number of classes</h2>
<p><span class="arithmatex">\(H_{\text{corr}}\)</span> is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: <span class="arithmatex">\(N_{\text{eff}} = e^{H_{\text{corr}}}\)</span> is the number of equiprobable classes that would yield the same entropy.</p>
<p>For the normalised score <span class="arithmatex">\(\hat{H}\)</span>, dividing by <span class="arithmatex">\(H_{\text{max}}\)</span> changes the logarithm base:</p>
<div class="arithmatex">\[\hat{H} = \frac{\log N_{\text{eff}}}{\log N_{\text{max}}} = \log_{N_{\text{max}}} N_{\text{eff}} \quad \Longleftrightarrow \quad N_{\text{eff}} = N_{\text{max}}^{\,\hat{H}}\]</div>
<p>The property is preserved: <span class="arithmatex">\(\hat{H}\)</span> is the logarithm (in base <span class="arithmatex">\(N_{\text{max}}\)</span>) of the effective number of equi-represented classes.</p>
<p>In the large-sample limit (<span class="arithmatex">\(n_{\text{words}} \gg 4^{ws}\)</span>), <span class="arithmatex">\(N_{\text{max}} \approx 4^{ws}\)</span>, giving:</p>
<div class="arithmatex">\[N_{\text{eff}} \approx 4^{ws \cdot \hat{H}}\]</div>
<p>This has a clean interpretation: <span class="arithmatex">\(ws \cdot \hat{H}\)</span> is the <strong>effective word length</strong> (in bases) of a perfectly uniform distribution that would produce the same entropy. At <span class="arithmatex">\(\hat{H} = 1\)</span> the full space of <span class="arithmatex">\(4^{ws}\)</span> words is used; at <span class="arithmatex">\(\hat{H} = 0.5\)</span> with ws=2, only <span class="arithmatex">\(4^1 = 4\)</span> effective classes out of 16 are occupied.</p>
<p>In our actual regime, <span class="arithmatex">\(n_{\text{words}}\)</span> is small and <span class="arithmatex">\(4^{ws}\)</span> can exceed <span class="arithmatex">\(n_{\text{words}}\)</span>, so <span class="arithmatex">\(H_{\text{max}} &lt; \log(4^{ws})\)</span> due to the small-sample correction. The exact effective count is <span class="arithmatex">\(N_{\text{max}}^{\hat{H}}\)</span>, not <span class="arithmatex">\(4^{ws \cdot \hat{H}}\)</span>.</p>
<h2 id="properties">Properties</h2>
<p>The entropy score is a function of the kmer sequence alone — it does not depend on the surrounding context or on the position within any genome. Two consequences:</p>
<ul>
<li><strong>Orientation invariance</strong>: <span class="arithmatex">\(\text{entropy}(K) = \text{entropy}(\text{revcomp}(K))\)</span>, guaranteed by the strand-symmetric canonical form.</li>
<li><strong>Context independence</strong>: the same kmer is always rejected or always kept, regardless of which genome it occurs in, where in that genome it appears, or which strand is considered. The filter defines a fixed partition of the kmer space into low-complexity and valid kmers.</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+910
View File
@@ -0,0 +1,910 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../entropy/">
<link rel="next" href="../../implementation/superkmer/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Partitioning architecture - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#partitioning-and-indexing-architecture" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Partitioning architecture
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Partitioning architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#why-hashing-is-necessary" class="md-nav__link">
<span class="md-ellipsis">
Why hashing is necessary
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#parameter-choices" class="md-nav__link">
<span class="md-ellipsis">
Parameter choices
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#why-hashing-is-necessary" class="md-nav__link">
<span class="md-ellipsis">
Why hashing is necessary
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#parameter-choices" class="md-nav__link">
<span class="md-ellipsis">
Parameter choices
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="partitioning-and-indexing-architecture">Partitioning and indexing architecture</h1>
<p>The canonical minimizer of a super-kmer is hashed to produce a <strong>p-bit routing value</strong> (p is a collection-level parameter):</p>
<div class="highlight"><pre><span></span><code>canonical minimizer → hash(minimizer) → p-bit value → PART → partition directory
</code></pre></div>
<p>PART is computed once at phase 1 to open the correct partition file, then discarded. It is recomputed on the fly at query time. It is never stored in the super-kmer header.</p>
<p>Each partition holds one MPHF instance (phase 6) that indexes kmers as plain u64 values — the minimizer plays no role inside the partition.</p>
<h2 id="why-hashing-is-necessary">Why hashing is necessary</h2>
<p>The canonical minimizer is an m-mer (m ∈ {9, 11, 13, 15}), encoded in 2m bits (18 to 30 bits). Its distribution over the <span class="arithmatex">\(4^m\)</span> possible values is <strong>not uniform</strong>: because the minimizer is the lexicographic minimum of a window of m-mers, small values are systematically over-represented (Golan &amp; Shur 2025; Kille <em>et al.</em> 2023; Pan &amp; Reinert 2024; Zheng <em>et al.</em> 2020, 2021)<sup id="fnref:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">1</a></sup> <sup id="fnref:Zheng2021-cc"><a class="footnote-ref" href="#fn:Zheng2021-cc">2</a></sup> <sup id="fnref:Pan2024-hb"><a class="footnote-ref" href="#fn:Pan2024-hb">3</a></sup> <sup id="fnref:Kille2023-px"><a class="footnote-ref" href="#fn:Kille2023-px">4</a></sup> <sup id="fnref:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">5</a></sup>. Routing directly by the raw minimizer value would produce severely unbalanced partitions.</p>
<p>A hash function with good avalanche properties redistributes this skewed distribution uniformly over the <span class="arithmatex">\(2^p\)</span> partition slots. The key reason this works well is the <strong>entropy gap</strong>: p is chosen to be much smaller than 2m, so the hash compresses many distinct minimizer values into each partition slot. Even under strong bias in the minimizer distribution, as long as its effective entropy exceeds p bits — which holds comfortably since the set of distinct minimizers in any real dataset is far larger than <span class="arithmatex">\(2^p\)</span> — the load imbalance across partitions is negligible.</p>
<h2 id="parameter-choices">Parameter choices</h2>
<table>
<thead>
<tr>
<th>m</th>
<th>2m (bits)</th>
<th>Typical p</th>
<th>Partitions</th>
</tr>
</thead>
<tbody>
<tr>
<td>9</td>
<td>18</td>
<td>68</td>
<td>64256</td>
</tr>
<tr>
<td>11</td>
<td>22</td>
<td>810</td>
<td>2561 024</td>
</tr>
<tr>
<td>13</td>
<td>26</td>
<td>1012</td>
<td>1 0244 096</td>
</tr>
<tr>
<td>15</td>
<td>30</td>
<td>1014</td>
<td>1 02416 384</td>
</tr>
</tbody>
</table>
<p>The hard constraint is p ≤ 2m: one cannot extract more bits of uniform randomness from a source than it contains. In practice p is chosen well below 2m, leaving a large entropy margin that absorbs the minimizer bias. For k=31, m=13, p=10: 1 024 partitions with comfortable balance.</p>
<div class="footnote">
<hr />
<ol>
<li id="fn:Zheng2020-ji">
<p>Zheng, H., Kingsford, C. &amp; Marçais, G. (2020). <a href="https://doi.org/10.1093/bioinformatics/btaa472">Improved design and analysis of practical minimizers</a>. <em>Bioinformatics (Oxford, England)</em>, 36, i119--i127.&#160;<a class="footnote-backref" href="#fnref:Zheng2020-ji" title="Jump back to footnote 1 in the text">&#8617;</a></p>
</li>
<li id="fn:Zheng2021-cc">
<p>Zheng, H., Kingsford, C. &amp; Marçais, G. (2021). <a href="https://doi.org/10.1093/bioinformatics/btab313">Sequence-specific minimizers via polar sets</a>. <em>Bioinformatics (Oxford, England)</em>, 37, i187--i195.&#160;<a class="footnote-backref" href="#fnref:Zheng2021-cc" title="Jump back to footnote 2 in the text">&#8617;</a></p>
</li>
<li id="fn:Pan2024-hb">
<p>Pan, C. &amp; Reinert, K. (2024). <a href="https://doi.org/10.1093/bioinformatics/btae045">A simple refined DNA minimizer operator enables 2-fold faster computation</a>. <em>Bioinformatics (Oxford, England)</em>, 40.&#160;<a class="footnote-backref" href="#fnref:Pan2024-hb" title="Jump back to footnote 3 in the text">&#8617;</a></p>
</li>
<li id="fn:Kille2023-px">
<p>Kille, B., Garrison, E., Treangen, T.J. &amp; Phillippy, A.M. (2023). <a href="https://doi.org/10.1093/bioinformatics/btad512">Minmers are a generalization of minimizers that enable unbiased local jaccard estimation</a>. <em>Bioinformatics (Oxford, England)</em>, 39.&#160;<a class="footnote-backref" href="#fnref:Kille2023-px" title="Jump back to footnote 4 in the text">&#8617;</a></p>
</li>
<li id="fn:Golan2025-xf">
<p>Golan, S. &amp; Shur, A.M. (2025). <a href="https://doi.org/10.1007/978-3-031-82670-2\_25">Expected density of random minimizers</a>. In: <em>Lecture notes in computer science</em>, Lecture notes in computer science. Springer Nature Switzerland, Cham, pp. 347--360.&#160;<a class="footnote-backref" href="#fnref:Golan2025-xf" title="Jump back to footnote 5 in the text">&#8617;</a></p>
</li>
</ol>
</div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+931
View File
@@ -0,0 +1,931 @@
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../..">
<link rel="next" href="../encoding/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Kmers and super-kmers - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#kmers-and-super-kmers" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#kmers" class="md-nav__link">
<span class="md-ellipsis">
Kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Super-kmers
</span>
</a>
<nav class="md-nav" aria-label="Super-kmers">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#canonical-super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Canonical super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#expected-length-of-a-super-kmer" class="md-nav__link">
<span class="md-ellipsis">
Expected length of a super-kmer
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#kmers" class="md-nav__link">
<span class="md-ellipsis">
Kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Super-kmers
</span>
</a>
<nav class="md-nav" aria-label="Super-kmers">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#canonical-super-kmers" class="md-nav__link">
<span class="md-ellipsis">
Canonical super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#expected-length-of-a-super-kmer" class="md-nav__link">
<span class="md-ellipsis">
Expected length of a super-kmer
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="kmers-and-super-kmers">Kmers and super-kmers</h1>
<h2 id="kmers">Kmers</h2>
<p>A <strong>kmer</strong> is a DNA subsequence of fixed length k. Two constraints govern the choice of k:</p>
<ul>
<li><strong>k ∈ [11, 31]</strong>: the range ensures the kmer is long enough to be specific and short enough to fit in a single machine word.</li>
<li><strong>k is odd</strong>: an odd-length sequence cannot equal its own reverse complement (no palindromes). This guarantees that the canonical form <code>min(kmer, revcomp(kmer))</code> is always strictly defined — the two orientations are always distinct — which is required for strand-independent counting.</li>
</ul>
<h2 id="super-kmers">Super-kmers</h2>
<p>A <strong>super-kmer</strong> is a maximal run of consecutive kmers from a DNA read, each overlapping the next by k1 nucleotides. Each kmer of the run carries the same <strong>canonical minimizer</strong>. The <strong>canonical minimizer</strong> of a kmer is the smallest value of <code>min(m-mer, revcomp(m-mer))</code> over all m-mers within the kmer (m &lt; k, m odd).</p>
<h3 id="canonical-super-kmers">Canonical super-kmers</h3>
<p>A <strong>canonical super-kmer</strong> is the lexicographic minimum of a super-kmer and its reverse complement:</p>
<div class="highlight"><pre><span></span><code>canonical(super-kmer) = min(super-kmer, revcomp(super-kmer))
</code></pre></div>
<p>When a read and its reverse-complement are both sequenced, they produce super-kmers that are reverse complements of each other. Both map to the same canonical form: the same genomic region is represented by a single canonical super-kmer regardless of which strand was read.</p>
<h3 id="expected-length-of-a-super-kmer">Expected length of a super-kmer</h3>
<p>For a random minimizer of length m over k-mers of length k, the density of minimizer positions is approximately 2/(km+2) (Golan &amp; Shur 2025; Zheng <em>et al.</em> 2020)<sup id="fnref:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">2</a></sup> <sup id="fnref:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">3</a></sup>, so the expected number of consecutive k-mers per super-kmer is (km+2)/2. A run of n k-mers spans n + k 1 nucleotides, giving:</p>
<div class="arithmatex">\[L_{\text{nt}} = \frac{k-m+2}{2} + k - 1\]</div>
<p>For k=31, m=13: expected ≈ 40 nt. In practice super-kmers rarely exceed a few dozen nucleotides.<sup id="fnref:superkmer_length"><a class="footnote-ref" href="#fn:superkmer_length">1</a></sup></p>
<div class="footnote">
<hr />
<ol>
<li id="fn:superkmer_length">
<p>The expected length formula and the density approximation 2/(km+2) should be verified against the values reported in (Zheng <em>et al.</em> 2020)<sup id="fnref2:Zheng2020-ji"><a class="footnote-ref" href="#fn:Zheng2020-ji">2</a></sup> and (Golan &amp; Shur 2025)<sup id="fnref2:Golan2025-xf"><a class="footnote-ref" href="#fn:Golan2025-xf">3</a></sup>.&#160;<a class="footnote-backref" href="#fnref:superkmer_length" title="Jump back to footnote 1 in the text">&#8617;</a></p>
</li>
<li id="fn:Zheng2020-ji">
<p>Zheng, H., Kingsford, C. &amp; Marçais, G. (2020). <a href="https://doi.org/10.1093/bioinformatics/btaa472">Improved design and analysis of practical minimizers</a>. <em>Bioinformatics (Oxford, England)</em>, 36, i119--i127.&#160;<a class="footnote-backref" href="#fnref:Zheng2020-ji" title="Jump back to footnote 2 in the text">&#8617;</a><a class="footnote-backref" href="#fnref2:Zheng2020-ji" title="Jump back to footnote 2 in the text">&#8617;</a></p>
</li>
<li id="fn:Golan2025-xf">
<p>Golan, S. &amp; Shur, A.M. (2025). <a href="https://doi.org/10.1007/978-3-031-82670-2\_25">Expected density of random minimizers</a>. In: <em>Lecture notes in computer science</em>, Lecture notes in computer science. Springer Nature Switzerland, Cham, pp. 347--360.&#160;<a class="footnote-backref" href="#fnref:Golan2025-xf" title="Jump back to footnote 3 in the text">&#8617;</a><a class="footnote-backref" href="#fnref2:Golan2025-xf" title="Jump back to footnote 3 in the text">&#8617;</a></p>
</li>
</ol>
</div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>
+377
View File
@@ -0,0 +1,377 @@
<?xml version="1.0" encoding="utf-8"?>
<style xmlns="http://purl.org/net/xbiblio/csl" class="in-text" version="1.0" demote-non-dropping-particle="sort-only" initialize-with-hyphen="false" page-range-format="minimal">
<info>
<title>Vancouver</title>
<id>http://www.zotero.org/styles/vancouver</id>
<link href="http://www.zotero.org/styles/vancouver" rel="self"/>
<link href="http://www.nlm.nih.gov/bsd/uniform_requirements.html" rel="documentation"/>
<author>
<name>Michael Berkowitz</name>
<email>mberkowi@gmu.edu</email>
</author>
<contributor>
<name>Sean Takats</name>
<email>stakats@gmu.edu</email>
</contributor>
<contributor>
<name>Sebastian Karcher</name>
</contributor>
<category citation-format="numeric"/>
<category field="generic-base"/>
<category field="medicine"/>
<summary>Vancouver style as outlined by International Committee of Medical Journal Editors Uniform Requirements for Manuscripts Submitted to Biomedical Journals: Sample References</summary>
<updated>2025-05-17T20:55:38-04:00</updated>
<rights license="http://creativecommons.org/licenses/by-sa/3.0/">This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 License</rights>
</info>
<locale xml:lang="en">
<date form="text" delimiter=" ">
<date-part name="year"/>
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="day"/>
</date>
<terms>
<term name="collection-editor" form="long">
<single>editor</single>
<multiple>editors</multiple>
</term>
<term name="presented at">presented at</term>
<term name="available at">available from</term>
<term name="section" form="short">sect.</term>
</terms>
</locale>
<locale xml:lang="fr">
<date form="text" delimiter=" ">
<date-part name="day"/>
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="year"/>
</date>
</locale>
<macro name="author">
<names variable="author">
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="long" prefix=", "/>
<substitute>
<text macro="webpage-title"/>
<names variable="editor"/>
</substitute>
</names>
</macro>
<macro name="editor">
<names variable="editor" suffix=".">
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="long" prefix=", "/>
</names>
</macro>
<macro name="chapter-marker">
<choose>
<if type="chapter paper-conference entry-dictionary entry-encyclopedia" match="any">
<text term="in" text-case="capitalize-first"/>
</if>
</choose>
</macro>
<macro name="webpage-title">
<!--If a webpage has a container, we're assuming the citation is "part of a website" as per ch. 25 Citing Medicine https://www.ncbi.nlm.nih.gov/books/NBK7274/?report=reader -->
<choose>
<if type="webpage" variable="container-title" match="all">
<group delimiter=" ">
<text variable="container-title"/>
<text term="internet" prefix="[" suffix="]" text-case="capitalize-first"/>
</group>
</if>
</choose>
</macro>
<macro name="publisher">
<choose>
<!--discard publisher info for articles-->
<if type="article-journal article-magazine article-newspaper" match="none">
<group delimiter=": " suffix=";">
<choose>
<if type="thesis">
<text variable="publisher-place" prefix="[" suffix="]"/>
</if>
<else-if type="speech"/>
<else>
<text variable="publisher-place"/>
</else>
</choose>
<text variable="publisher"/>
</group>
</if>
</choose>
</macro>
<macro name="access">
<choose>
<if variable="URL">
<group delimiter=": ">
<text term="available at" text-case="capitalize-first"/>
<text variable="URL"/>
</group>
</if>
</choose>
</macro>
<macro name="accessed-date">
<choose>
<if variable="URL">
<group prefix="[" suffix="]" delimiter=" ">
<text term="cited" text-case="lowercase"/>
<date variable="accessed" form="text"/>
</group>
</if>
</choose>
</macro>
<macro name="container-title">
<choose>
<if type="article-journal article-magazine chapter paper-conference article-newspaper review review-book entry-dictionary entry-encyclopedia" match="any">
<group suffix="." delimiter=" ">
<choose>
<if type="article-journal review review-book" match="any">
<text variable="container-title" form="short" strip-periods="true"/>
</if>
<else>
<text variable="container-title" strip-periods="true"/>
</else>
</choose>
<choose>
<if variable="URL">
<text term="internet" prefix="[" suffix="]" text-case="capitalize-first"/>
</if>
</choose>
</group>
<text macro="edition" prefix=" "/>
</if>
<!--add event-name and event-place once they become available-->
<else-if type="bill legislation" match="any">
<group delimiter=", ">
<group delimiter=". ">
<text variable="container-title"/>
<group delimiter=" ">
<text term="section" form="short" text-case="capitalize-first"/>
<text variable="section"/>
</group>
</group>
<text variable="number"/>
</group>
</else-if>
<else-if type="speech">
<group delimiter=": " suffix=";">
<group delimiter=" ">
<text variable="genre" text-case="capitalize-first"/>
<text term="presented at"/>
</group>
<text variable="event"/>
</group>
</else-if>
<else>
<group delimiter=", " suffix=".">
<choose>
<if variable="collection-title" match="none">
<group delimiter=" ">
<label variable="volume" form="short" text-case="capitalize-first"/>
<text variable="volume"/>
</group>
</if>
</choose>
<text variable="container-title"/>
</group>
</else>
</choose>
</macro>
<macro name="title">
<choose>
<if type="webpage" variable="container-title" match="all"/>
<else>
<text variable="title"/>
<choose>
<if type="article-journal article-magazine chapter paper-conference article-newspaper review review-book entry-dictionary entry-encyclopedia" match="none">
<choose>
<if variable="URL">
<text term="internet" prefix=" [" suffix="]" text-case="capitalize-first"/>
</if>
</choose>
<text macro="edition" prefix=". "/>
</if>
</choose>
</else>
</choose>
<choose>
<if type="thesis">
<text variable="genre" prefix=" [" suffix="]"/>
</if>
</choose>
</macro>
<macro name="edition">
<choose>
<if is-numeric="edition">
<group delimiter=" ">
<number variable="edition" form="ordinal"/>
<text term="edition" form="short"/>
</group>
</if>
<else>
<text variable="edition" suffix="."/>
</else>
</choose>
</macro>
<macro name="date">
<choose>
<if type="article-journal article-magazine article-newspaper review review-book" match="any">
<group suffix=";" delimiter=" ">
<date variable="issued" form="text"/>
<text macro="accessed-date"/>
</group>
</if>
<else-if type="bill legislation" match="any">
<group delimiter=", ">
<date variable="issued" delimiter=" ">
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="day"/>
</date>
<date variable="issued">
<date-part name="year"/>
</date>
</group>
</else-if>
<else-if type="report">
<date variable="issued" delimiter=" ">
<date-part name="year"/>
<date-part name="month" form="short" strip-periods="true"/>
</date>
<text macro="accessed-date" prefix=" "/>
</else-if>
<else-if type="patent">
<group suffix=".">
<group delimiter=", ">
<text variable="number"/>
<date variable="issued">
<date-part name="year"/>
</date>
</group>
<text macro="accessed-date" prefix=" "/>
</group>
</else-if>
<else-if type="speech">
<group delimiter="; ">
<group delimiter=" ">
<date variable="issued" delimiter=" ">
<date-part name="year"/>
<date-part name="month" form="short" strip-periods="true"/>
<date-part name="day"/>
</date>
<text macro="accessed-date"/>
</group>
<text variable="event-place"/>
</group>
</else-if>
<else>
<group suffix=".">
<date variable="issued">
<date-part name="year"/>
</date>
<text macro="accessed-date" prefix=" "/>
</group>
</else>
</choose>
</macro>
<macro name="pages">
<choose>
<if type="article-journal article-magazine article-newspaper review review-book" match="any">
<text variable="page" prefix=":"/>
</if>
<else-if type="book" match="any">
<text variable="number-of-pages" prefix=" "/>
<choose>
<if is-numeric="number-of-pages">
<label variable="number-of-pages" form="short" prefix=" " plural="never"/>
</if>
</choose>
</else-if>
<else>
<group prefix=" " delimiter=" ">
<label variable="page" form="short" plural="never"/>
<text variable="page"/>
</group>
</else>
</choose>
</macro>
<macro name="journal-location">
<choose>
<if type="article-journal article-magazine review review-book" match="any">
<text variable="volume"/>
<text variable="issue" prefix="(" suffix=")"/>
</if>
</choose>
</macro>
<macro name="webpage-part">
<choose>
<if type="webpage" variable="container-title" match="all">
<text variable="title"/>
</if>
</choose>
</macro>
<macro name="collection-details">
<choose>
<if type="article-journal article-magazine article-newspaper review review-book" match="none">
<choose>
<if variable="collection-title">
<group delimiter=" " prefix="(" suffix=")">
<names variable="collection-editor" suffix=".">
<name sort-separator=" " initialize-with="" name-as-sort-order="all" delimiter=", " delimiter-precedes-last="always"/>
<label form="long" prefix=", "/>
</names>
<group delimiter="; ">
<text variable="collection-title"/>
<group delimiter=" ">
<label variable="volume" form="short"/>
<text variable="volume"/>
</group>
</group>
</group>
</if>
</choose>
</if>
</choose>
</macro>
<macro name="report-details">
<choose>
<if type="report">
<text variable="number" prefix="Report No.: "/>
</if>
</choose>
</macro>
<citation collapse="citation-number">
<sort>
<key variable="citation-number"/>
</sort>
<layout prefix="(" suffix=")" delimiter=",">
<text variable="citation-number"/>
</layout>
</citation>
<bibliography et-al-min="7" et-al-use-first="6" second-field-align="flush">
<layout>
<text variable="citation-number" suffix="."/>
<group delimiter=". " suffix=". ">
<text macro="author"/>
<text macro="title"/>
</group>
<group delimiter=" " suffix=". ">
<group delimiter=": ">
<text macro="chapter-marker"/>
<group delimiter=" ">
<text macro="editor"/>
<text macro="container-title"/>
</group>
</group>
<text macro="publisher"/>
<group>
<text macro="date"/>
<text macro="journal-location"/>
<text macro="pages"/>
</group>
<text macro="webpage-part"/>
</group>
<text macro="collection-details" suffix=". "/>
<text macro="report-details" suffix=". "/>
<text macro="access"/>
</layout>
</bibliography>
</style>