27f5e88a7b
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
1162 lines
30 KiB
HTML
1162 lines
30 KiB
HTML
|
||
<!doctype html>
|
||
<html lang="en" class="no-js">
|
||
<head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
|
||
|
||
|
||
|
||
<link rel="prev" href="../chunkreader/">
|
||
|
||
|
||
<link rel="next" href="../obipipeline/">
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="icon" href="../../assets/images/favicon.png">
|
||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||
|
||
|
||
|
||
<title>Construction pipeline - obikmer</title>
|
||
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||
|
||
|
||
|
||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
|
||
<body dir="ltr">
|
||
|
||
|
||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||
<label class="md-overlay" for="__drawer"></label>
|
||
<div data-md-component="skip">
|
||
|
||
|
||
<a href="#construction-pipeline" class="md-skip">
|
||
Skip to content
|
||
</a>
|
||
|
||
</div>
|
||
<div data-md-component="announce">
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<header class="md-header md-header--shadow" data-md-component="header">
|
||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
<label class="md-header__button md-icon" for="__drawer">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||
</label>
|
||
<div class="md-header__title" data-md-component="header-title">
|
||
<div class="md-header__ellipsis">
|
||
<div class="md-header__topic">
|
||
<span class="md-ellipsis">
|
||
obikmer
|
||
</span>
|
||
</div>
|
||
<div class="md-header__topic" data-md-component="header-topic">
|
||
<span class="md-ellipsis">
|
||
|
||
Construction pipeline
|
||
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||
|
||
|
||
|
||
|
||
</nav>
|
||
|
||
</header>
|
||
|
||
<div class="md-container" data-md-component="container">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<main class="md-main" data-md-component="main">
|
||
<div class="md-main__inner md-grid">
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||
<label class="md-nav__title" for="__drawer">
|
||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
obikmer
|
||
</label>
|
||
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../.." class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Home
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Theory
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_2">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Theory
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../kmers/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmers and super-kmers
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/encoding/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
DNA encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/entropy/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Entropy filter
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Minimizer selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/indexing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Partitioning architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||
<label class="md-nav__title" for="__nav_3">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../superkmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
SuperKmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../kmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../chunkreader/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Chunk reader
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active">
|
||
|
||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<a href="./" class="md-nav__link md-nav__link--active">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-0-parameter-estimation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 0 — Parameter estimation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-1-scatter" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 1 — Scatter
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-2-dereplication" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 2 — Dereplication
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-3-per-kmer-count-aggregation-and-quorum-filtering" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 3 — Per-kmer count aggregation and quorum filtering
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-4-super-kmer-compaction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 4 — Super-kmer compaction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-5-local-de-bruijn-graph-and-unitig-construction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 5 — Local de Bruijn graph and unitig construction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-6-mphf-construction-and-index-finalisation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 6 — MPHF construction and index finalisation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obipipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obipipeline library
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../storage/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
On-disk storage
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../mphf/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
MPHF selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../unitig_evidence/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Unitig evidence encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_4">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Sequences
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-0-parameter-estimation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 0 — Parameter estimation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-1-scatter" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 1 — Scatter
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-2-dereplication" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 2 — Dereplication
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-3-per-kmer-count-aggregation-and-quorum-filtering" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 3 — Per-kmer count aggregation and quorum filtering
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-4-super-kmer-compaction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 4 — Super-kmer compaction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-5-local-de-bruijn-graph-and-unitig-construction" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 5 — Local de Bruijn graph and unitig construction
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-6-mphf-construction-and-index-finalisation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 6 — MPHF construction and index finalisation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-content" data-md-component="content">
|
||
|
||
<article class="md-content__inner md-typeset">
|
||
|
||
|
||
|
||
|
||
|
||
<h1 id="construction-pipeline">Construction pipeline</h1>
|
||
<p>All phases after scatter are embarrassingly parallel across partitions.</p>
|
||
<h2 id="phase-0-parameter-estimation">Phase 0 — Parameter estimation</h2>
|
||
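<p>The construction parameters p (number of partition bits, i.e. 2<sup>p</sup> partitions), n (bit width of the stored per-kmer counts), and min_count (minimum abundance for a kmer to be kept) depend on the kmer frequency spectrum of the dataset. Estimating this spectrum before construction avoids costly re-partitioning if p is badly chosen.</p>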
|
||
<p>Two approaches are supported:</p>
|
||
<ul>
|
||
<li><strong>External estimation (preferred):</strong> run <a href="https://github.com/bcgsc/ntCard">ntCard</a> on the input files and pass its histogram output to <code>obikmer build</code>. ntCard produces a kmer frequency histogram in a single streaming pass using ntHash and a Flajolet-Martin-style estimator; obikmer reads this file and derives p, n, and min_count automatically.</li>
|
||
<li><strong>Internal estimation (future):</strong> an <code>obikmer estimate</code> subcommand for users who prefer a single-tool workflow. The implementation would combine two components: (1) <strong>ntHash</strong>, a rolling hash that updates the kmer hash in O(1) per nucleotide by incrementally adding the incoming base and removing the outgoing one — Rust crates exist; (2) a <strong>Flajolet-Martin-style streaming estimator</strong> that maintains a small table of minimum hash values and infers the frequency histogram from their statistical distribution, as described in the ntCard paper (Mohamadi <em>et al.</em> 2017)<sup id="fnref:Mohamadi2017-ok"><a class="footnote-ref" href="#fn:Mohamadi2017-ok">1</a></sup>.</li>
|
||
</ul>
|
||
<p>The histogram gives:</p>
|
||
<ul>
|
||
<li><strong>F0</strong> (number of distinct kmers) → sets p (target ~10M kmers/partition → p = ⌈log₂(F0 / 10M)⌉)</li>
|
||
<li><strong>frequency distribution</strong> → sets n (choose n so that fewer than 1% of kmers overflow)</li>
|
||
<li><strong>error valley</strong> → suggests min_count (typically the local minimum between the error peak and the coverage peak)</li>
|
||
</ul>
|
||
<h2 id="phase-1-scatter">Phase 1 — Scatter</h2>
|
||
<p>Single streaming pass over raw input files (FASTA/FASTQ, gzip). FASTQ quality scores are ignored. For each read:</p>
|
||
<ol>
|
||
<li><strong>Ambiguous base filter</strong>: cut at any non-ACGT base; discard fragments shorter than k.</li>
|
||
<li><strong>Entropy filter</strong>: scan each fragment with a sliding window of size k. When the kmer <span class="arithmatex">\(K_i = S[i \mathinner{..} i+k-1]\)</span> ending at nucleotide <span class="arithmatex">\(S[j]\)</span> (with <span class="arithmatex">\(j = i+k-1\)</span>) has entropy at or below the threshold <span class="arithmatex">\(\theta\)</span>, emit the current segment and start a new one (see algorithm below). <span class="arithmatex">\(K_i\)</span> belongs to neither segment, and no valid kmer is lost.</li>
|
||
<li><strong>Length filter</strong>: discard any segment shorter than k produced by step 2.</li>
|
||
<li><strong>Super-kmer extraction</strong>: for each clean segment, slide a minimizer window and group consecutive kmers sharing the same canonical minimizer; canonise each super-kmer by lexicographic comparison with its reverse complement (early exit).</li>
|
||
<li><strong>Partition routing</strong>: <code>hash(canonical_minimizer) → PART</code> → append super-kmer to <code>partition/superkmers.bin.gz</code>.</li>
|
||
</ol>
|
||
<p><strong>Segmentation behavior:</strong></p>
|
||
<p>When <span class="arithmatex">\(K_i\)</span> (ended by <span class="arithmatex">\(S[j]\)</span>, <span class="arithmatex">\(j = i+k-1\)</span>) fails the entropy threshold:</p>
|
||
<ul>
|
||
<li>Current segment <span class="arithmatex">\(S[\textit{seg_start} \mathinner{..} j-1]\)</span> is emitted (last valid kmer = <span class="arithmatex">\(K_{i-1}\)</span>)</li>
|
||
<li>New segment starts at <span class="arithmatex">\(S[i+1]\)</span> (first new kmer = <span class="arithmatex">\(K_{i+1}\)</span>)</li>
|
||
<li><span class="arithmatex">\(K_i\)</span> is excluded: current segment lacks <span class="arithmatex">\(S[j]\)</span>, new segment lacks <span class="arithmatex">\(S[i]\)</span></li>
|
||
<li>Overlap = <span class="arithmatex">\(S[i+1 \mathinner{..} j-1]\)</span> = <span class="arithmatex">\(k-2\)</span> nucleotides</li>
|
||
</ul>
|
||
<div class="admonition abstract">
|
||
<p class="admonition-title">Algorithm — Entropy filter: sliding window segmentation</p>
|
||
<div class="highlight"><pre><span></span><code>procedure EntropyFilter(S, N, k, θ):
|
||
seg_start ← 0
|
||
window ← []
|
||
for j ← 0 to N−1:
|
||
window.push(S[j])
|
||
if |window| &lt; k: continue
|
||
i ← j − k + 1
|
||
if entropy(window) ≤ θ:
|
||
emit S[seg_start .. j−1]
|
||
seg_start ← i + 1
|
||
window ← S[i+1 .. j]
|
||
else:
|
||
window.pop_front()
|
||
emit S[seg_start .. N−1]
|
||
</code></pre></div>
|
||
</div>
|
||
<p>Writes are sequential and append-only — IO-friendly. Gzip applied at write time. Data volume ≈ raw genome size (2 bits/nt compaction offsets header overhead).</p>
|
||
<h2 id="phase-2-dereplication">Phase 2 — Dereplication</h2>
|
||
<p>Performed independently per partition. Identical super-kmers are consolidated and their COUNT accumulated — analogous to amplicon dereplication in metabarcoding. Uses external bucket sort to stay within RAM bounds:</p>
|
||
<p><strong>Pass 1</strong> (streaming): hash the nucleotide payload of each super-kmer, route to one of B bucket files:</p>
|
||
<div class="highlight"><pre><span></span><code>hash(sequence) % B → bucket_i.bin
|
||
</code></pre></div>
|
||
<p>B ≈ 100 is tunable; RAM needed ≈ partition_size / B.</p>
|
||
<p><strong>Pass 2</strong>: for each bucket, load into an in-memory <code>HashMap&lt;sequence, COUNT&gt;</code>, dereplicate by summing COUNT values, write consolidated super-kmers.</p>
|
||
<p>After dereplication: at N× coverage the partition shrinks by a factor of ~N (errors aside). The COUNT field in each super-kmer header = number of times that exact super-kmer sequence was observed across all input reads.</p>
|
||
<p><strong>Important:</strong> super-kmer COUNT ≠ individual kmer count. A kmer can appear in multiple distinct super-kmers (same partition, different flanking context); its true count = sum of COUNT of all super-kmers containing it. A super-kmer with COUNT=1 may contain only high-abundance kmers, each appearing in many other super-kmers. Abundance filtering therefore cannot be applied at this phase.</p>
|
||
<h2 id="phase-3-per-kmer-count-aggregation-and-quorum-filtering">Phase 3 — Per-kmer count aggregation and quorum filtering</h2>
|
||
<p>For each dereplicated super-kmer, enumerate its kmers and accumulate counts:</p>
|
||
<div class="highlight"><pre><span></span><code>for each super-kmer (sequence, COUNT):
|
||
for each kmer in sequence:
|
||
kmer_counts[canonical(kmer)] += COUNT
|
||
</code></pre></div>
|
||
<p>Implemented as an external sort or a temporary HashMap, depending on partition size. At the end of this phase, each distinct canonical kmer has its exact total count.</p>
|
||
<p>Abundance filter applied here: kmers with <code>total_count &lt; q</code> are discarded. <code>q</code> is a collection parameter (0 = keep all, including singletons for ≤1x data).</p>
|
||
<p>No pre-filter on super-kmer COUNT is possible at phase 2: a super-kmer with COUNT=1 may contain only high-abundance kmers, each present in many other super-kmers across the partition.</p>
|
||
<h2 id="phase-4-super-kmer-compaction">Phase 4 — Super-kmer compaction</h2>
|
||
<p>The valid kmer set from phase 3 is used as a mask to rewrite the super-kmer files:</p>
|
||
<div class="highlight"><pre><span></span><code>for each dereplicated super-kmer:
|
||
scan kmer by kmer
|
||
kmer not in valid set → break point (terminates current super-kmer)
|
||
kmer in valid set → extend current super-kmer
|
||
</code></pre></div>
|
||
<p>Three cases per super-kmer:</p>
|
||
<ul>
|
||
<li><strong>All kmers valid</strong> → copied as-is</li>
|
||
<li><strong>No kmer valid</strong> → discarded</li>
|
||
<li><strong>Mixed</strong> → split into sub-super-kmers at invalid boundaries; each sub-super-kmer inherits the original COUNT</li>
|
||
</ul>
|
||
<p>After splitting, re-apply dereplication (bucket sort, phase 2 method) — splitting can produce new identical super-kmers. This re-dereplication is cheap: the volume is already greatly reduced.</p>
|
||
<p>Output: a clean super-kmer file where every kmer passes quorum. This file feeds phase 5.</p>
|
||
<h2 id="phase-5-local-de-bruijn-graph-and-unitig-construction">Phase 5 — Local de Bruijn graph and unitig construction</h2>
|
||
<p>Within each partition, build a <strong>local de Bruijn graph</strong> from the valid kmer set and compute its unitigs. All operations are local to the partition — no cross-partition communication.</p>
|
||
<div class="highlight"><pre><span></span><code>valid kmers → HashSet&lt;u64&gt;
|
||
|
||
for each kmer K:
|
||
out_degree = |{K[1:]+b | b ∈ {A,C,G,T}} ∩ HashSet|
|
||
in_degree = |{b+K[:-1] | b ∈ {A,C,G,T}} ∩ HashSet|
|
||
|
||
internal node ↔ in_degree=1 AND out_degree=1
|
||
branching / dead-end → unitig start or end
|
||
</code></pre></div>
|
||
<p>Traverse non-branching paths to assemble unitigs. Kmers whose neighbours fall in other partitions appear as dead ends locally — they terminate the unitig. The result: <strong>each kmer appears in exactly one unitig</strong> within the partition.</p>
|
||
<p>The partition size (controlled by p) must be calibrated so that the HashSet fits in RAM during this phase.</p>
|
||
<p>Output: <code>unitigs.bin</code> — the permanent evidence structure for the partition. Each kmer in the partition appears at exactly one (unitig_id, offset) location.</p>
|
||
<p><strong>Scope of local unitigs:</strong> these are unitigs of the partition's local de Bruijn graph, not global unitigs. A kmer whose successor or predecessor (the neighbouring kmer overlapping it on k−1 nucleotides) falls in another partition appears as a dead end locally and terminates the unitig. This does not affect correctness of verification but means partition-local unitigs cannot be directly reused for global assembly.</p>
|
||
<h2 id="phase-6-mphf-construction-and-index-finalisation">Phase 6 — MPHF construction and index finalisation</h2>
|
||
<p>Built once on the definitive kmer set (all kmers in all unitigs of the partition):</p>
|
||
<div class="highlight"><pre><span></span><code>kmers from unitigs → MPHF → mphf.bin
|
||
→ counts.bin : packed n-bit array (or 1-bit for presence mode)
|
||
→ refs.bin : u32 nucleotide offset into unitigs.bin per kmer
|
||
</code></pre></div>
|
||
<p>The MPHF is built once — no rebuild. The n-bit width for <code>counts.bin</code> is chosen from the observed count distribution (n=5 covers ~97% of kmers at 15x; n=1 for presence mode). Counts exceeding 2ⁿ−1 go into <code>overflow.bin</code> as sorted <code>(mphf_index: u32, count: u32)</code> pairs.</p>
|
||
<p><strong>Exact verification via unitig evidence:</strong></p>
|
||
<p><code>unitigs.bin</code> serves as the evidence structure: for any query kmer, the stored unitig provides the ground truth to confirm or deny its presence. The MPHF maps every input to [0, N) including absent kmers — the unitig read-back is the only way to guarantee exactness.</p>
|
||
<div class="highlight"><pre><span></span><code>query kmer q
|
||
→ canonical_minimizer(q) → hash → PART → part_XXXX/
|
||
→ MPHF(q) → index i
|
||
→ refs[i] = (unitig_id, kmer_offset)
|
||
→ read unitig from unitigs.bin → extract kmer at kmer_offset → compare with q
|
||
→ match : return counts[i] ← exact hit
|
||
→ no match: kmer absent ← MPHF collision on absent kmer
|
||
</code></pre></div>
|
||
<p>One random disk access into <code>unitigs.bin</code> per query; the unitig is the minimal, non-redundant evidence (each kmer stored once). <code>superkmers.bin.gz</code> is no longer needed at this point and can be deleted.</p>
|
||
<div class="footnote">
|
||
<hr />
|
||
<ol>
|
||
<li id="fn:Mohamadi2017-ok">
|
||
<p>Mohamadi, H., Khan, H. &amp; Birol, I. (2017). <a href="https://doi.org/10.1093/bioinformatics/btw832">ntCard: A streaming algorithm for cardinality estimation in genomics data</a>. <em>Bioinformatics (Oxford, England)</em>, 33, 1324–1330. <a class="footnote-backref" href="#fnref:Mohamadi2017-ok" title="Jump back to footnote 1 in the text">↩</a></p>
|
||
</li>
|
||
</ol>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
</div>
|
||
|
||
|
||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||
</div>
|
||
|
||
</main>
|
||
|
||
<footer class="md-footer">
|
||
|
||
<div class="md-footer-meta md-typeset">
|
||
<div class="md-footer-meta__inner md-grid">
|
||
<div class="md-copyright">
|
||
|
||
|
||
Made with
|
||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||
Material for MkDocs
|
||
</a>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
<div class="md-dialog" data-md-component="dialog">
|
||
<div class="md-dialog__inner md-typeset"></div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||
|
||
|
||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||
|
||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||
|
||
|
||
</body>
|
||
</html> |