f36b095ce2
Formalize the two-phase MPHF indexing architecture and update Phase 6 to use `evidence.bin` for direct kmer extraction. Simplify the evidence and unitig storage layouts to flat packed formats enabling O(1) random access. Introduce aggregation traits (`ColumnWeights`, `CountPartials`, `BitPartials`) to support additive distance metric decomposition across partitions. Narrow the documented scope from metagenomic to individual genome datasets, and replace speculative open questions with concrete implementation specifications.
1404 lines
31 KiB
HTML
1404 lines
31 KiB
HTML
|
||
<!doctype html>
|
||
<html lang="en" class="no-js">
|
||
<head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
|
||
|
||
|
||
|
||
<link rel="prev" href="../storage/">
|
||
|
||
|
||
<link rel="next" href="../unitig_evidence/">
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="icon" href="../../assets/images/favicon.png">
|
||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||
|
||
|
||
|
||
<title>MPHF selection - obikmer</title>
|
||
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||
|
||
|
||
|
||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
|
||
<body dir="ltr">
|
||
|
||
|
||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||
<label class="md-overlay" for="__drawer"></label>
|
||
<div data-md-component="skip">
|
||
|
||
|
||
<a href="#mphf-selection-two-phase-indexing-architecture" class="md-skip">
|
||
Skip to content
|
||
</a>
|
||
|
||
</div>
|
||
<div data-md-component="announce">
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<header class="md-header md-header--shadow" data-md-component="header">
|
||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
<label class="md-header__button md-icon" for="__drawer">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||
</label>
|
||
<div class="md-header__title" data-md-component="header-title">
|
||
<div class="md-header__ellipsis">
|
||
<div class="md-header__topic">
|
||
<span class="md-ellipsis">
|
||
obikmer
|
||
</span>
|
||
</div>
|
||
<div class="md-header__topic" data-md-component="header-topic">
|
||
<span class="md-ellipsis">
|
||
|
||
MPHF selection
|
||
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||
|
||
|
||
|
||
|
||
</nav>
|
||
|
||
</header>
|
||
|
||
<div class="md-container" data-md-component="container">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<main class="md-main" data-md-component="main">
|
||
<div class="md-main__inner md-grid">
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||
<label class="md-nav__title" for="__drawer">
|
||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
obikmer
|
||
</label>
|
||
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../.." class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Home
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Theory
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_2">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Theory
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../kmers/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmers and super-kmers
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/encoding/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
DNA encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/entropy/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Entropy filter
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Minimizer selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/indexing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Partitioning architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||
<label class="md-nav__title" for="__nav_3">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../superkmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
SuperKmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../kmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../chunkreader/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Chunk reader
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../pipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obipipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obipipeline library
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../storage/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
On-disk storage
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active">
|
||
|
||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
MPHF selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<a href="./" class="md-nav__link md-nav__link--active">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
MPHF selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#why-two-phases-are-needed" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Why two phases are needed
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Why two phases are needed">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-1-provisional-mphf-kmer-spectrum" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 1 — provisional MPHF + kmer spectrum
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-2-definitive-mphf" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 2 — definitive MPHF
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#mphf-candidates" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
MPHF candidates
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#mphf-choice-per-phase" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
MPHF choice per phase
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#space-at-scale" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Space at scale
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#ptr_hash-configuration-phase-2" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
ptr_hash configuration (phase 2)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#multilayer-index-architecture" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Multilayer index architecture
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Multilayer index architecture">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#layer-structure" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Layer structure
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#membership-verification" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Membership verification
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#query-algorithm" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Query algorithm
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#merging-layers" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Merging layers
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../unitig_evidence/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Unitig evidence encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obilayeredmap/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obilayeredmap crate
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../persistent_compact_int_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentCompactIntVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../persistent_bit_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentBitVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_4">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Sequences
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/index_architecture/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer index
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#why-two-phases-are-needed" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Why two phases are needed
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Why two phases are needed">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-1-provisional-mphf-kmer-spectrum" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 1 — provisional MPHF + kmer spectrum
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#phase-2-definitive-mphf" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Phase 2 — definitive MPHF
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#mphf-candidates" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
MPHF candidates
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#mphf-choice-per-phase" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
MPHF choice per phase
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#space-at-scale" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Space at scale
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#ptr_hash-configuration-phase-2" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
ptr_hash configuration (phase 2)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#multilayer-index-architecture" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Multilayer index architecture
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Multilayer index architecture">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#layer-structure" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Layer structure
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#membership-verification" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Membership verification
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#query-algorithm" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Query algorithm
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#merging-layers" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Merging layers
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-content" data-md-component="content">
|
||
|
||
<article class="md-content__inner md-typeset">
|
||
|
||
|
||
|
||
|
||
|
||
<h1 id="mphf-selection-two-phase-indexing-architecture">MPHF selection — two-phase indexing architecture</h1>
|
||
<h2 id="why-two-phases-are-needed">Why two phases are needed</h2>
|
||
<p>Kmer indexing per partition proceeds in two phases. The separation is necessary because the exact number of surviving unique kmers is not known until after counting and filtering low-abundance kmers.</p>
|
||
<h3 id="phase-1-provisional-mphf-kmer-spectrum">Phase 1 — provisional MPHF + kmer spectrum</h3>
|
||
<p>Implemented in <code>obikpartitionner::KmerPartition::count_kmer()</code>.</p>
|
||
<ol>
|
||
<li><strong>Pass 1</strong>: read the dereplicated superkmer file; enumerate all unique canonical kmers into a <code>HashSet</code>. Exact count known after this pass.</li>
|
||
<li><strong>Build a provisional MPHF</strong> (<code>GOFunction</code> from the <code>ph</code> crate) over the exact kmer set. Produces <code>mphf1.bin</code>.</li>
|
||
<li><strong>Create <code>counts1.bin</code></strong>: one zero-initialised <code>u32</code> per MPHF slot (mmap'd).</li>
|
||
<li><strong>Pass 2</strong>: re-read the dereplicated file; for each kmer, query <code>mphf1.get(kmer)</code> and atomically accumulate the superkmer count into <code>counts1[slot]</code>.</li>
|
||
<li><strong>Build kmer frequency spectrum</strong> from <code>counts1</code>: histogram <code>{count → n_kmers}</code>, totals f0 (distinct kmers) and f1 (total abundance). Written to <code>kmer_spectrum_raw.json</code> per partition, then merged globally.</li>
|
||
</ol>
|
||
<p>Files produced per partition:</p>
|
||
<div class="highlight"><pre><span></span><code>part_XXXXX/
|
||
mphf1.bin — GOFunction (provisional MPHF, discarded after phase 2)
|
||
counts1.bin — [u32; n_kmers] kmer counts, mmap'd
|
||
kmer_spectrum_raw.json — local frequency spectrum
|
||
</code></pre></div>
|
||
<h3 id="phase-2-definitive-mphf">Phase 2 — definitive MPHF</h3>
|
||
<p>After filtering (applying a min-count threshold derived from the spectrum) and building the local De Bruijn graph + unitigs (see <a href="../pipeline/">Construction pipeline</a>), the exact filtered kmer set is available via <code>unitigs.bin</code>.</p>
|
||
<p><code>MphfLayer::build</code> is called on the unitig file:</p>
|
||
<ol>
|
||
<li><strong>Pass 1</strong>: iterate all canonical kmers from <code>unitigs.bin</code> in parallel, build and store <code>mphf.bin</code> (ptr_hash).</li>
|
||
<li><strong>Pass 2</strong>: iterate sequentially, fill <code>evidence.bin</code>, call the mode-specific <code>fill_slot</code> callback.</li>
|
||
</ol>
|
||
<p><code>mphf1.bin</code> and <code>counts1.bin</code> are no longer needed after phase 2 and can be deleted.</p>
|
||
<hr />
|
||
<h2 id="mphf-candidates">MPHF candidates</h2>
|
||
<p><strong>boomphf</strong> (BBHash algorithm, maintained by 10X Genomics):</p>
|
||
<ul>
|
||
<li>~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem)</li>
|
||
<li>Supports streaming construction (no exact count needed)</li>
|
||
<li>Drawback: largest space footprint; streaming advantage is irrelevant at phase 2 since the exact count is available</li>
|
||
</ul>
|
||
<p><strong>ptr_hash</strong> (PtrHash algorithm, Groot Koerkamp, SEA 2025):</p>
|
||
<ul>
|
||
<li>~2.4 bits/key; fastest queries (≥2.1× over alternatives, 8–12 ns/key for u64) and fastest construction (≥3.1×)</li>
|
||
<li>Requires exact key count at construction — available at both phases after pass 1</li>
|
||
<li>Published February 2025; the risk of depending on such a recent crate is accepted given its performance profile and the fact that each MPHF is independently rebuildable from its unitig file</li>
|
||
</ul>
|
||
<p><strong>FMPH/FMPHGO</strong> (<code>ph</code> crate, Beling, ACM JEA 2023):</p>
|
||
<ul>
|
||
<li>~2.1 bits/key — most compact; good query speed; deterministic construction</li>
|
||
<li>Works well from an exact or slightly overestimated count</li>
|
||
<li><code>GOFunction</code> (group-oriented variant) is the specific type used</li>
|
||
</ul>
|
||
<h2 id="mphf-choice-per-phase">MPHF choice per phase</h2>
|
||
<p><strong>Phase 1</strong> (provisional, discarded after phase 2): <code>ph::fmph::GOFunction</code>. Compact, fast to build from the exact post-dedup kmer set. Query speed is secondary — the structure is only used during pass 2 of <code>count_kmer</code>.</p>
|
||
<p><strong>Phase 2</strong> (persistent, queried repeatedly): <strong>ptr_hash</strong>. Exact key count is available from the unitig index; ptr_hash query speed (≥2.1×) and construction speed (≥3.1× over FMPH) are the decisive factors. The 2.4 bits/key overhead is acceptable.</p>
|
||
<p>boomphf is eliminated: largest space overhead, streaming advantage does not apply.</p>
|
||
<hr />
|
||
<h2 id="space-at-scale">Space at scale</h2>
|
||
<p>For 1 024 partitions × 100 M kmers/partition (phase 2 index, after filtering):</p>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>MPHF</th>
|
||
<th>bits/key</th>
|
||
<th>Total MPHF size</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td>boomphf</td>
|
||
<td>3.7</td>
|
||
<td>~47 GB</td>
|
||
</tr>
|
||
<tr>
|
||
<td>ptr_hash</td>
|
||
<td>2.4</td>
|
||
<td>~31 GB</td>
|
||
</tr>
|
||
<tr>
|
||
<td>FMPH</td>
|
||
<td>2.1</td>
|
||
<td>~27 GB</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>For a human genome at 30× coverage with 1 024 partitions, realistic partition sizes are 3–30 M unique kmers → 1–8 MB per phase-2 MPHF, well within RAM.</p>
|
||
<hr />
|
||
<h2 id="ptr_hash-configuration-phase-2">ptr_hash configuration (phase 2)</h2>
|
||
<div class="highlight"><pre><span></span><code><span class="k">type</span><span class="w"> </span><span class="nc">Mphf</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">PtrHash</span><span class="o">&lt;</span>
|
||
<span class="w"> </span><span class="kt">u64</span><span class="p">,</span><span class="w"> </span><span class="c1">// key: canonical kmer raw encoding</span>
|
||
<span class="w"> </span><span class="n">CubicEps</span><span class="p">,</span><span class="w"> </span><span class="c1">// bucket fn: 2.4 bits/key, λ=3.5, α=0.99</span>
|
||
<span class="w"> </span><span class="n">CachelineEfVec</span><span class="o">&lt;</span><span class="nb">Vec</span><span class="o">&lt;</span><span class="n">CachelineEf</span><span class="o">&gt;&gt;</span><span class="p">,</span><span class="w"> </span><span class="c1">// remap: 11.6 bits/entry (Elias-Fano)</span>
|
||
<span class="w"> </span><span class="n">Xx64</span><span class="p">,</span><span class="w"> </span><span class="c1">// hasher: XXH3-64 with seed</span>
|
||
<span class="w"> </span><span class="nb">Vec</span><span class="o">&lt;</span><span class="kt">u8</span><span class="o">&gt;</span><span class="p">,</span><span class="w"> </span><span class="c1">// pilots</span>
|
||
<span class="o">&gt;</span><span class="p">;</span>
|
||
</code></pre></div>
|
||
<p><strong>Hasher — <code>Xx64</code></strong>: canonical kmer raw values are left-aligned u64 with structural zeros in low bits (42 zeros for k=11, 2 zeros for k=31). <code>FxHash</code> (single multiply) distributes these poorly; <code>Xx64</code> (XXH3-64, seeded) handles structured input correctly.</p>
|
||
<p><strong>Bucket function — <code>CubicEps</code></strong>: λ=3.5, α=0.99. Balanced tradeoff: 2× slower construction than <code>Linear/λ=3.0</code>, 20% less space. <code>default_compact</code> (λ=4.0) saves a further 12.5% at 2× more construction time — not chosen.</p>
|
||
<p><strong>Remap — <code>CachelineEfVec</code></strong>: Elias-Fano variant packing 44 sorted 40-bit values per 64-byte cacheline (11.6 bits/value vs 32 for <code>Vec&lt;u32&gt;</code>). One cacheline per query; space win dominates at billion-scale key counts.</p>
|
||
<hr />
|
||
<h2 id="multilayer-index-architecture">Multilayer index architecture</h2>
|
||
<h3 id="layer-structure">Layer structure</h3>
|
||
<p>Each layer is a self-contained unit. See <a href="../obilayeredmap/">obilayeredmap</a> for the full on-disk layout. The MPHF-relevant files are:</p>
|
||
<div class="highlight"><pre><span></span><code>layer_i/
|
||
unitigs.bin — packed 2-bit nucleotide sequences (kmer evidence)
|
||
mphf.bin — ptr_hash phase-2 MPHF
|
||
evidence.bin — n × u32: (chunk_id: 25 bits | rank: 7 bits) per slot
|
||
</code></pre></div>
|
||
<p>Layers are <strong>disjoint</strong>: a canonical kmer belongs to exactly one layer. Layer 0 is built from dataset A. Adding dataset B:</p>
|
||
<ol>
|
||
<li>For each kmer in B: probe existing layers. If found, the kmer is already indexed.</li>
|
||
<li>Collect kmers of B not present in any layer → set <code>B \ A</code>.</li>
|
||
<li>Build layer 1 from <code>B \ A</code> (dereplicate → count → De Bruijn → unitigs → <code>MphfLayer::build</code>).</li>
|
||
</ol>
|
||
<h3 id="membership-verification">Membership verification</h3>
|
||
<p>ptr_hash maps any input to a valid slot — it does not natively detect absent keys. Membership is verified using the evidence entry: decode the kmer from <code>(chunk_id, rank)</code> and compare to the query. A mismatch means the kmer is absent from this layer; probe the next layer.</p>
|
||
<h3 id="query-algorithm">Query algorithm</h3>
|
||
<div class="highlight"><pre><span></span><code>fn query(kmer) → Option&lt;(layer_index, slot)&gt;:
|
||
for (i, layer) in layers.iter().enumerate():
|
||
slot = layer.mphf.index(kmer)
|
||
if layer.evidence.decode(slot) matches kmer:
|
||
return Some((i, slot))
|
||
return None
|
||
</code></pre></div>
|
||
<p>Expected probe depth: 1 for kmers in layer 0. Each probe is a ptr_hash lookup (~10 ns) plus one evidence decode.</p>
|
||
<h3 id="merging-layers">Merging layers</h3>
|
||
<p>Two layer chains can be merged by re-indexing their union through the full pipeline. This is expensive (full rebuild) but produces an optimal single-layer index. Merge is a maintenance operation, not a query-path requirement.</p>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
</div>
|
||
|
||
|
||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||
</div>
|
||
|
||
</main>
|
||
|
||
<footer class="md-footer">
|
||
|
||
<div class="md-footer-meta md-typeset">
|
||
<div class="md-footer-meta__inner md-grid">
|
||
<div class="md-copyright">
|
||
|
||
|
||
Made with
|
||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||
Material for MkDocs
|
||
</a>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
<div class="md-dialog" data-md-component="dialog">
|
||
<div class="md-dialog__inner md-typeset"></div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||
|
||
|
||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||
|
||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||
|
||
|
||
</body>
|
||
</html> |