f36b095ce2
Formalize the two-phase MPHF indexing architecture and update Phase 6 to use `evidence.bin` for direct kmer extraction. Simplify the evidence and unitig storage layouts to flat packed formats enabling O(1) random access. Introduce aggregation traits (`ColumnWeights`, `CountPartials`, `BitPartials`) to support additive distance metric decomposition across partitions. Narrow the documented scope from metagenomic to individual genome datasets, and replace speculative open questions with concrete implementation specifications.
1274 lines
31 KiB
HTML
1274 lines
31 KiB
HTML
|
||
<!doctype html>
|
||
<html lang="en" class="no-js">
|
||
<head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
|
||
|
||
|
||
|
||
<link rel="prev" href="../chunkreader/">
|
||
|
||
|
||
<link rel="next" href="../obipipeline/">
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="icon" href="../../assets/images/favicon.png">
|
||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||
|
||
|
||
|
||
<title>Construction pipeline - obikmer</title>
|
||
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||
|
||
|
||
|
||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
|
||
<body dir="ltr">
|
||
|
||
|
||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||
<label class="md-overlay" for="__drawer"></label>
|
||
<div data-md-component="skip">
|
||
|
||
|
||
<a href="#construction-pipeline" class="md-skip">
|
||
Skip to content
|
||
</a>
|
||
|
||
</div>
|
||
<div data-md-component="announce">
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<header class="md-header md-header--shadow" data-md-component="header">
|
||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
<label class="md-header__button md-icon" for="__drawer">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||
</label>
|
||
<div class="md-header__title" data-md-component="header-title">
|
||
<div class="md-header__ellipsis">
|
||
<div class="md-header__topic">
|
||
<span class="md-ellipsis">
|
||
obikmer
|
||
</span>
|
||
</div>
|
||
<div class="md-header__topic" data-md-component="header-topic">
|
||
<span class="md-ellipsis">
|
||
|
||
Construction pipeline
|
||
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||
|
||
|
||
|
||
|
||
</nav>
|
||
|
||
</header>
|
||
|
||
<div class="md-container" data-md-component="container">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<main class="md-main" data-md-component="main">
|
||
<div class="md-main__inner md-grid">
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||
<label class="md-nav__title" for="__drawer">
|
||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
obikmer
|
||
</label>
|
||
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../.." class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Home
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Theory
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_2">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Theory
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../kmers/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmers and super-kmers
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/encoding/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
DNA encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/entropy/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Entropy filter
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Minimizer selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/indexing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Partitioning architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||
<label class="md-nav__title" for="__nav_3">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../superkmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
SuperKmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../kmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../chunkreader/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Chunk reader
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active">
|
||
|
||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<a href="./" class="md-nav__link md-nav__link--active">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-0-parameter-estimation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 0 — Parameter estimation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-1-scatter" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 1 — Scatter
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-2-dereplication" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 2 — Dereplication
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-3-per-kmer-count-aggregation-and-quorum-filtering" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 3 — Per-kmer count aggregation and quorum filtering
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-4-super-kmer-compaction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 4 — Super-kmer compaction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-5-local-de-bruijn-graph-and-unitig-construction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 5 — Local de Bruijn graph and unitig construction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-6-mphf-construction-and-index-finalisation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 6 — MPHF construction and index finalisation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obipipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obipipeline library
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../storage/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
On-disk storage
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../mphf/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
MPHF selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../unitig_evidence/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Unitig evidence encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obilayeredmap/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obilayeredmap crate
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../persistent_compact_int_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentCompactIntVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../persistent_bit_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentBitVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_4">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Sequences
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/index_architecture/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer index
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-0-parameter-estimation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 0 — Parameter estimation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-1-scatter" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 1 — Scatter
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-2-dereplication" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 2 — Dereplication
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-3-per-kmer-count-aggregation-and-quorum-filtering" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 3 — Per-kmer count aggregation and quorum filtering
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-4-super-kmer-compaction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 4 — Super-kmer compaction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-5-local-de-bruijn-graph-and-unitig-construction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 5 — Local de Bruijn graph and unitig construction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-6-mphf-construction-and-index-finalisation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 6 — MPHF construction and index finalisation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-content" data-md-component="content">
|
||
|
||
<article class="md-content__inner md-typeset">
|
||
|
||
|
||
|
||
|
||
|
||
<h1 id="construction-pipeline">Construction pipeline</h1>
|
||
<p>All phases after scatter are embarrassingly parallel across partitions.</p>
|
||
<h2 id="phase-0-parameter-estimation">Phase 0 — Parameter estimation</h2>
|
||
<p>The construction parameters p, n, and min_count depend on the kmer frequency spectrum of the dataset. Estimating this spectrum before construction avoids costly re-partitioning if p is badly chosen.</p>
|
||
<p>Two approaches are supported:</p>
|
||
<ul>
|
||
<li><strong>External estimation (preferred):</strong> run <a href="https://github.com/bcgsc/ntCard">ntCard</a> on the input files and pass its histogram output to <code>obikmer build</code>. ntCard produces a kmer frequency histogram in a single streaming pass using ntHash and a Flajolet-Martin-style estimator; obikmer reads this file and derives p, n, and min_count automatically.</li>
|
||
<li><strong>Internal estimation (future):</strong> an <code>obikmer estimate</code> subcommand for users who prefer a single-tool workflow. The implementation would combine two components: (1) <strong>ntHash</strong>, a rolling hash that updates the kmer hash in O(1) per nucleotide by incrementally adding the incoming base and removing the outgoing one — Rust crates exist; (2) a <strong>Flajolet-Martin-style streaming estimator</strong> that maintains a small table of minimum hash values and infers the frequency histogram from their statistical distribution, as described in the ntCard paper (Mohamadi <em>et al.</em> 2017)<sup id="fnref:Mohamadi2017-ok"><a class="footnote-ref" href="#fn:Mohamadi2017-ok">1</a></sup>.</li>
|
||
</ul>
|
||
<p>The histogram gives:</p>
|
||
<ul>
|
||
<li><strong>F0</strong> (number of distinct kmers) → sets p (target ~10M kmers/partition → p = ⌈log₂(F0 / 10M)⌉)</li>
|
||
<li><strong>frequency distribution</strong> → sets n (choose n so that fewer than 1% of kmers overflow)</li>
|
||
<li><strong>error valley</strong> → suggests min_count (typically the local minimum between the error peak and the coverage peak)</li>
|
||
</ul>
|
||
<h2 id="phase-1-scatter">Phase 1 — Scatter</h2>
|
||
<p>Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. For each read:</p>
|
||
<ol>
|
||
<li><strong>Ambiguous base filter</strong>: cut at any non-ACGT base; discard fragments shorter than k.</li>
|
||
<li><strong>Entropy filter</strong>: scan each fragment with a sliding window of size k. When the kmer <span class="arithmatex">\(K_i = S[i \mathinner{..} i+k-1]\)</span> ended by nucleotide <span class="arithmatex">\(S[j]\)</span> (with <span class="arithmatex">\(j = i+k-1\)</span>) has entropy at or below the threshold <span class="arithmatex">\(\theta\)</span>, emit the current segment and start a new one (see algorithm below). <span class="arithmatex">\(K_i\)</span> belongs to neither segment, and no valid kmer is lost.</li>
|
||
<li><strong>Length filter</strong>: discard any segment shorter than k produced by step 2.</li>
|
||
<li><strong>Super-kmer extraction</strong>: for each clean segment, slide a minimizer window and group consecutive kmers sharing the same canonical minimizer; canonicalise each super-kmer by lexicographic comparison with its reverse complement (early exit).</li>
|
||
<li><strong>Partition routing</strong>: <code>hash(canonical_minimizer) → PART</code> → append super-kmer to <code>partition/superkmers.bin.gz</code>.</li>
|
||
</ol>
|
||
<p><strong>Segmentation behavior:</strong></p>
|
||
<p>When <span class="arithmatex">\(K_i\)</span> (ended by <span class="arithmatex">\(S[j]\)</span>, <span class="arithmatex">\(j = i+k-1\)</span>) fails the entropy threshold:</p>
|
||
<ul>
|
||
<li>Current segment <span class="arithmatex">\(S[\textit{seg_start} \mathinner{..} j-1]\)</span> is emitted (last valid kmer = <span class="arithmatex">\(K_{i-1}\)</span>)</li>
|
||
<li>New segment starts at <span class="arithmatex">\(S[i+1]\)</span> (first new kmer = <span class="arithmatex">\(K_{i+1}\)</span>)</li>
|
||
<li><span class="arithmatex">\(K_i\)</span> is excluded: current segment lacks <span class="arithmatex">\(S[j]\)</span>, new segment lacks <span class="arithmatex">\(S[i]\)</span></li>
|
||
<li>Overlap = <span class="arithmatex">\(S[i+1 \mathinner{..} j-1]\)</span> = <span class="arithmatex">\(k-2\)</span> nucleotides</li>
|
||
</ul>
|
||
<div class="admonition abstract">
|
||
<p class="admonition-title">Algorithm — Entropy filter: sliding window segmentation</p>
|
||
<div class="highlight"><pre><span></span><code>procedure EntropyFilter(S, N, k, θ):
|
||
    seg_start ← 0
|
||
    window ← []
|
||
    for j ← 0 to N−1:
|
||
        window.push(S[j])
|
||
        if |window| &lt; k: continue
|
||
        i ← j − k + 1
|
||
        if entropy(window) ≤ θ:
|
||
            emit S[seg_start .. j−1]
|
||
            seg_start ← i + 1
|
||
            window ← S[i+1 .. j]
|
||
        else:
|
||
            window.pop_front()
|
||
    emit S[seg_start .. N−1]
|
||
</code></pre></div>
|
||
</div>
|
||
<p>Writes are sequential and append-only — IO-friendly. Gzip is applied at write time. Data volume ≈ raw genome size (the 2 bits/nt compaction offsets the header overhead).</p>
|
||
<h2 id="phase-2-dereplication">Phase 2 — Dereplication</h2>
|
||
<p>Performed independently per partition. Identical super-kmers are consolidated and their COUNT accumulated — analogous to amplicon dereplication in metabarcoding. Uses external bucket sort to stay within RAM bounds:</p>
|
||
<p><strong>Pass 1</strong> (streaming): hash the nucleotide payload of each super-kmer, route to one of B bucket files:</p>
|
||
<div class="highlight"><pre><span></span><code>hash(sequence) % B → bucket_i.bin
|
||
</code></pre></div>
|
||
<p>B ≈ 100 is tunable; RAM needed ≈ partition_size / B.</p>
|
||
<p><strong>Pass 2</strong>: for each bucket, load into an in-memory <code>HashMap&lt;sequence, COUNT&gt;</code>, dereplicate by summing COUNT values, write consolidated super-kmers.</p>
|
||
<p>After dereplication: at N× coverage the partition shrinks by a factor of ~N (errors aside). The COUNT field in each super-kmer header = number of times that exact super-kmer sequence was observed across all input reads.</p>
|
||
<p><strong>Important:</strong> super-kmer COUNT ≠ individual kmer count. A kmer can appear in multiple distinct super-kmers (same partition, different flanking context); its true count = sum of COUNT of all super-kmers containing it. A super-kmer with COUNT=1 may contain only high-abundance kmers, each appearing in many other super-kmers. Abundance filtering therefore cannot be applied at this phase.</p>
|
||
<h2 id="phase-3-per-kmer-count-aggregation-and-quorum-filtering">Phase 3 — Per-kmer count aggregation and quorum filtering</h2>
|
||
<p>For each dereplicated super-kmer, enumerate its kmers and accumulate counts:</p>
|
||
<div class="highlight"><pre><span></span><code>for each super-kmer (sequence, COUNT):
|
||
    for each kmer in sequence:
|
||
        kmer_counts[canonical(kmer)] += COUNT
|
||
</code></pre></div>
|
||
<p>Implemented as an external sort or a temporary HashMap, depending on partition size. At the end of this phase, each distinct canonical kmer has its exact total count.</p>
|
||
<p>The abundance filter is applied here: kmers with <code>total_count &lt; q</code> are discarded. <code>q</code> is a collection parameter (0 = keep all, including singletons for ≤1x data).</p>
|
||
<p>No pre-filter on super-kmer COUNT is possible at phase 2: a super-kmer with COUNT=1 may contain only high-abundance kmers, each present in many other super-kmers across the partition.</p>
|
||
<h2 id="phase-4-super-kmer-compaction">Phase 4 — Super-kmer compaction</h2>
|
||
<p>The valid kmer set from phase 3 is used as a mask to rewrite the super-kmer files:</p>
|
||
<div class="highlight"><pre><span></span><code>for each dereplicated super-kmer:
|
||
    scan kmer by kmer
|
||
        kmer not in valid set → break point (terminates current super-kmer)
|
||
        kmer in valid set → extend current super-kmer
|
||
</code></pre></div>
|
||
<p>Three cases per super-kmer:</p>
|
||
<ul>
|
||
<li><strong>All kmers valid</strong> → copied as-is</li>
|
||
<li><strong>No kmer valid</strong> → discarded</li>
|
||
<li><strong>Mixed</strong> → split into sub-super-kmers at invalid boundaries; each sub-super-kmer inherits the original COUNT</li>
|
||
</ul>
|
||
<p>After splitting, re-apply dereplication (bucket sort, phase 2 method) — splitting can produce new identical super-kmers. This re-dereplication is cheap: the volume is already greatly reduced.</p>
|
||
<p>Output: a clean super-kmer file where every kmer passes quorum. This file feeds phase 5.</p>
|
||
<h2 id="phase-5-local-de-bruijn-graph-and-unitig-construction">Phase 5 — Local de Bruijn graph and unitig construction</h2>
|
||
<p>Within each partition, build a <strong>local de Bruijn graph</strong> from the valid kmer set and compute its unitigs. All operations are local to the partition — no cross-partition communication.</p>
|
||
<div class="highlight"><pre><span></span><code>valid kmers → HashSet&lt;u64&gt;
|
||
|
||
for each kmer K:
|
||
out_degree = |{K[1:]+b | b ∈ {A,C,G,T}} ∩ HashSet|
|
||
in_degree = |{b+K[:-1] | b ∈ {A,C,G,T}} ∩ HashSet|
|
||
|
||
internal node ↔ in_degree=1 AND out_degree=1
|
||
branching / dead-end → unitig start or end
|
||
</code></pre></div>
|
||
<p>Traverse non-branching paths to assemble unitigs. Kmers whose neighbours fall in other partitions appear as dead ends locally — they terminate the unitig. The result: <strong>each kmer appears in exactly one unitig</strong> within the partition.</p>
|
||
<p>The partition size (controlled by p) must be calibrated so that the HashSet fits in RAM during this phase.</p>
|
||
<p>Output: <code>unitigs.bin</code> — the permanent evidence structure for the partition. Each kmer in the partition appears at exactly one (unitig_id, offset) location.</p>
|
||
<p><strong>Scope of local unitigs:</strong> these are unitigs of the partition's local de Bruijn graph, not global unitigs. A kmer whose successor or predecessor (the neighbour sharing a (k−1)-nucleotide overlap) falls in another partition appears as a dead end locally and terminates the unitig. This does not affect the correctness of verification, but it means partition-local unitigs cannot be directly reused for global assembly.</p>
|
||
<h2 id="phase-6-mphf-construction-and-index-finalisation">Phase 6 — MPHF construction and index finalisation</h2>
|
||
<p>The MPHF is built once on the definitive kmer set (all kmers in all unitigs of the partition). See <a href="../obilayeredmap/">obilayeredmap</a> and <a href="../mphf/">MPHF selection</a> for the current implementation.</p>
|
||
<div class="highlight"><pre><span></span><code>kmers from unitigs → MPHF → mphf.bin
|
||
→ evidence.bin : N × u32, each = (chunk_id: 25 bits | rank: 7 bits)
|
||
→ payload : counts/ (mode 2) or presence/ (mode 3)
|
||
</code></pre></div>
|
||
<p>The index is built in two passes over <code>unitigs.bin</code>: a parallel pass builds <code>mphf.bin</code>, then a sequential pass fills <code>evidence.bin</code> and the payload. The exact kmer count is available from the unitig index (<code>unitigs.bin.idx</code>) before the passes begin.</p>
|
||
<p><strong>Exact verification via unitig evidence:</strong></p>
|
||
<p><code>unitigs.bin</code> serves as the evidence structure. The MPHF maps every input to <code>[0, N)</code> including absent kmers — the unitig read-back (via <code>evidence.bin</code>) is the only correct membership test.</p>
|
||
<div class="highlight"><pre><span></span><code>query kmer q
|
||
→ canonical_minimizer(q) → hash → PART → part_XXXXX/
|
||
→ MPHF(q) → slot s
|
||
→ evidence[s] = (chunk_id, rank)
|
||
→ read k nucleotides at rank in unitigs[chunk_id] → compare with q
|
||
→ match : return payload[s] ← exact hit
|
||
→ no match: kmer absent ← MPHF collision on absent kmer
|
||
</code></pre></div>
|
||
<p><code>superkmers.bin.gz</code> is no longer needed at this point and can be deleted.</p>
|
||
<div class="footnote">
|
||
<hr />
|
||
<ol>
|
||
<li id="fn:Mohamadi2017-ok">
|
||
<p>Mohamadi, H., Khan, H. &amp; Birol, I. (2017). <a href="https://doi.org/10.1093/bioinformatics/btw832">ntCard: A streaming algorithm for cardinality estimation in genomics data</a>. <em>Bioinformatics (Oxford, England)</em>, 33, 1324–1330. <a class="footnote-backref" href="#fnref:Mohamadi2017-ok" title="Jump back to footnote 1 in the text">↩</a></p>
|
||
</li>
|
||
</ol>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
</div>
|
||
|
||
|
||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||
</div>
|
||
|
||
</main>
|
||
|
||
<footer class="md-footer">
|
||
|
||
<div class="md-footer-meta md-typeset">
|
||
<div class="md-footer-meta__inner md-grid">
|
||
<div class="md-copyright">
|
||
|
||
|
||
Made with
|
||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||
Material for MkDocs
|
||
</a>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
<div class="md-dialog" data-md-component="dialog">
|
||
<div class="md-dialog__inner md-typeset"></div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||
|
||
|
||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||
|
||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||
|
||
|
||
</body>
|
||
</html> |