bb7adc1154
Expands MkDocs navigation and documentation for evidence elimination, the merge command, and kmer filtering. Refactors kmer representation to a generic `KmerOf<L>` type with a bitwise reverse complement algorithm. Unifies MPHF construction, introduces approximate fingerprint-based indexing, and updates the pipeline, chunkreader, and storage layouts. Adds code coverage reports and clarifies architectural invariants for improved maintainability.
1779 lines
57 KiB
HTML
1779 lines
57 KiB
HTML
|
|
<!doctype html>
|
|
<html lang="en" class="no-js">
|
|
<head>
|
|
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
|
|
|
|
|
|
|
|
<link rel="prev" href="../sequences/invariant/">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="icon" href="../../assets/images/favicon.png">
|
|
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
|
|
|
|
|
|
|
<title>Kmer index - obikmer</title>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
|
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
|
|
|
|
|
|
|
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
|
|
|
|
|
|
|
|
|
|
|
</head>
|
|
|
|
|
|
<body dir="ltr">
|
|
|
|
|
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
|
<label class="md-overlay" for="__drawer"></label>
|
|
<div data-md-component="skip">
|
|
|
|
|
|
<a href="#kmer-index-architecture" class="md-skip">
|
|
Skip to content
|
|
</a>
|
|
|
|
</div>
|
|
<div data-md-component="announce">
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<header class="md-header md-header--shadow" data-md-component="header">
|
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
|
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
<label class="md-header__button md-icon" for="__drawer">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
|
</label>
|
|
<div class="md-header__title" data-md-component="header-title">
|
|
<div class="md-header__ellipsis">
|
|
<div class="md-header__topic">
|
|
<span class="md-ellipsis">
|
|
obikmer
|
|
</span>
|
|
</div>
|
|
<div class="md-header__topic" data-md-component="header-topic">
|
|
<span class="md-ellipsis">
|
|
|
|
Kmer index
|
|
|
|
</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
|
|
|
|
|
|
|
|
|
</nav>
|
|
|
|
</header>
|
|
|
|
<div class="md-container" data-md-component="container">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<main class="md-main" data-md-component="main">
|
|
<div class="md-main__inner md-grid">
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
|
<label class="md-nav__title" for="__drawer">
|
|
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
obikmer
|
|
</label>
|
|
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../.." class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Home
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Theory
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Theory
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../kmers/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmers and super-kmers
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/encoding/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
DNA encoding
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/entropy/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Entropy filter
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/minimizer/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Minimizer selection
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/indexing/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Partitioning architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Implementation
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Implementation
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/superkmer/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
SuperKmer
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/kmer/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmer
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/chunkreader/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Chunk reader
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/pipeline/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Construction pipeline
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/obipipeline/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
obipipeline library
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/storage/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
On-disk storage
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/mphf/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
MPHF selection
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/unitig_evidence/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Unitig evidence encoding
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/evidence_elimination/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Evidence elimination (discussion)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
obilayeredmap crate
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
PersistentCompactIntVec
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
PersistentBitVec
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/merge/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Merge command
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../implementation/rebuild_filter/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmer filtering (rebuild/dump/unitig)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" checked>
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="true">
|
|
<label class="md-nav__title" for="__nav_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Architecture
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../sequences/invariant/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Sequences
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active">
|
|
|
|
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmer index
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<a href="./" class="md-nav__link md-nav__link--active">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmer index
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#fundamental-invariant" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Fundamental invariant
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#three-level-hierarchy" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Three-level hierarchy
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#indexconfig-and-indexmeta" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
IndexConfig and IndexMeta
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#evidencekind" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
EvidenceKind
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#mphflayer-autonomous-kmer-slot-mapping" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
MphfLayer — autonomous kmer → slot mapping
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#layerd-mphf-data-payload" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Layer&lt;D&gt; — MPHF + data payload
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#datastore-slot-indexed-data" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
DataStore — slot-indexed data
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#aggregation-traits-obicompactvectraits" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Aggregation traits — obicompactvec::traits
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#layeredstores-recursive-aggregation-wrapper" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
LayeredStore&lt;S&gt; — recursive aggregation wrapper
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#progressive-aggregation-principle" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Progressive aggregation principle
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#multi-genome-column-invariant" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Multi-genome column invariant
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#query-model" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Query model
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Query model">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#point-query" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Point query
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#aggregation" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Aggregation
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#parallelism-model" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Parallelism model
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#reindex-evidence-conversion-in-place" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
reindex — evidence conversion in place
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#estimate-parameter-dry-run" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
estimate — parameter dry-run
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#fundamental-invariant" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Fundamental invariant
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#three-level-hierarchy" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Three-level hierarchy
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#indexconfig-and-indexmeta" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
IndexConfig and IndexMeta
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#evidencekind" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
EvidenceKind
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#mphflayer-autonomous-kmer-slot-mapping" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
MphfLayer — autonomous kmer → slot mapping
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#layerd-mphf-data-payload" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Layer&lt;D&gt; — MPHF + data payload
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#datastore-slot-indexed-data" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
DataStore — slot-indexed data
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#aggregation-traits-obicompactvectraits" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Aggregation traits — obicompactvec::traits
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#layeredstores-recursive-aggregation-wrapper" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
LayeredStore&lt;S&gt; — recursive aggregation wrapper
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#progressive-aggregation-principle" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Progressive aggregation principle
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#multi-genome-column-invariant" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Multi-genome column invariant
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#query-model" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Query model
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Query model">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#point-query" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Point query
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#aggregation" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Aggregation
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#parallelism-model" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Parallelism model
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#reindex-evidence-conversion-in-place" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
reindex — evidence conversion in place
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#estimate-parameter-dry-run" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
estimate — parameter dry-run
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-content" data-md-component="content">
|
|
|
|
<article class="md-content__inner md-typeset">
|
|
|
|
|
|
|
|
|
|
|
|
<h1 id="kmer-index-architecture">Kmer index architecture</h1>
|
|
<h2 id="fundamental-invariant">Fundamental invariant</h2>
|
|
<p>A given canonical kmer belongs to <strong>exactly one partition</strong> and <strong>exactly one layer</strong> within that partition. This property makes all aggregation operations decomposable and parallelisable without coordination.</p>
|
|
<hr />
|
|
<h2 id="three-level-hierarchy">Three-level hierarchy</h2>
|
|
<div class="highlight"><pre><span></span><code>KmerIndex (index.meta + KmerPartition)
|
|
├── partition_0/index/ one directory per minimiser bucket
|
|
│ ├── meta.json PartitionMeta { n_layers }
|
|
│ ├── layer_0/
|
|
│ │ ├── layer_meta.json LayerMeta { evidence: EvidenceKind }
|
|
│ │ ├── mphf.bin PtrHash MPHF
|
|
│ │ ├── unitigs.bin unitig spine (never overwritten)
|
|
│ │ ├── evidence.bin exact evidence (Exact only)
|
|
│ │ ├── unitigs.bin.idx block index (Exact only)
|
|
│ │ ├── fingerprint.bin fingerprints (Approx only)
|
|
│ │ ├── counts/ PersistentCompactIntMatrix (with_counts = true)
|
|
│ │ └── presence/ PersistentBitMatrix
|
|
│ └── layer_1/
|
|
│ └── ...
|
|
└── partition_1/index/
|
|
└── ...
|
|
</code></pre></div>
|
|
<p><strong>KmerIndex</strong>: root entry point. Owns <code>IndexMeta</code> (written to <code>index.meta</code>) and a <code>KmerPartition</code> that routes canonical kmers to partition directories. All partition-level operations are dispatched in parallel via rayon.</p>
|
|
<p><strong>Partition directory</strong>: one directory per minimiser bucket. <code>PartitionMeta</code> (stored as <code>meta.json</code>) records <code>n_layers</code>. Layers within a partition cover disjoint kmer sets.</p>
|
|
<p><strong>Layer directory</strong>: one <code>MphfLayer</code> plus optional data stores. <code>LayerMeta</code> (stored as <code>layer_meta.json</code>) records which <code>EvidenceKind</code> was used. The MPHF and <code>unitigs.bin</code> are immutable once built; evidence files are the only part replaced by <code>reindex</code>.</p>
|
|
<hr />
|
|
<h2 id="indexconfig-and-indexmeta">IndexConfig and IndexMeta</h2>
|
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">IndexConfig</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">kmer_size</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">minimizer_size</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">n_bits</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="c1">// log2(n_partitions)</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">with_counts</span><span class="p">:</span><span class="w"> </span><span class="kt">bool</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">evidence</span><span class="p">:</span><span class="w"> </span><span class="nc">EvidenceKind</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">block_bits</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="c1">// .idx granularity: 2^block_bits unitigs/block; 0 = one entry per unitig</span>
|
|
<span class="p">}</span>
|
|
|
|
<span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">IndexMeta</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">version</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">config</span><span class="p">:</span><span class="w"> </span><span class="nc">IndexConfig</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">genomes</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="n">GenomeInfo</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="c1">// ordered; index = genome column number</span>
|
|
<span class="p">}</span>
|
|
|
|
<span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">GenomeInfo</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">label</span><span class="p">:</span><span class="w"> </span><span class="nb">String</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="n">meta</span><span class="p">:</span><span class="w"> </span><span class="nc">HashMap</span><span class="o"><</span><span class="nb">String</span><span class="p">,</span><span class="w"> </span><span class="nb">String</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="c1">// arbitrary categorical metadata</span>
|
|
<span class="p">}</span>
|
|
</code></pre></div>
|
|
<p><code>IndexMeta</code> is serialised as <code>index.meta</code> (JSON). It is the authority for the ordered list of genomes and for the parameters that govern all subsequent operations on the index.</p>
|
|
<hr />
|
|
<h2 id="evidencekind">EvidenceKind</h2>
|
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">enum</span><span class="w"> </span><span class="nc">EvidenceKind</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="n">Exact</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="n">Approx</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">b</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">z</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="w"> </span><span class="p">},</span>
|
|
<span class="p">}</span>
|
|
</code></pre></div>
|
|
<p>Controls which files are written per layer and which query path is taken:</p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Variant</th>
|
|
<th>Files written</th>
|
|
<th>False-positive rate</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td><code>Exact</code></td>
|
|
<td><code>evidence.bin</code>, <code>unitigs.bin.idx</code></td>
|
|
<td>0</td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>Approx { b, z }</code></td>
|
|
<td><code>fingerprint.bin</code></td>
|
|
<td>≈ W / 2^(b·z) per read (Findere)</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<p><code>EvidenceKind</code> is stored both in <code>IndexConfig</code> (index-wide default, updated by <code>reindex</code>) and in each <code>LayerMeta</code> (per-layer record of what was actually built).</p>
|
|
<hr />
|
|
<h2 id="mphflayer-autonomous-kmer-slot-mapping">MphfLayer — autonomous kmer → slot mapping</h2>
|
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">MphfLayer</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="n">mphf</span><span class="p">:</span><span class="w"> </span><span class="nc">PtrHash</span><span class="o"><</span><span class="err">…</span><span class="o">></span><span class="p">,</span>
|
|
<span class="w"> </span><span class="n">ev</span><span class="p">:</span><span class="w"> </span><span class="nc">LayerEvidence</span><span class="p">,</span><span class="w"> </span><span class="c1">// Exact { evidence, unitigs } | Approx { fingerprint }</span>
|
|
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
|
<span class="p">}</span>
|
|
</code></pre></div>
|
|
<p><code>MphfLayer::find(kmer)</code> dispatches transparently to <code>find_exact</code> or <code>find_approx</code> based on the evidence loaded at <code>open</code> time (read from <code>layer_meta.json</code>). Returns <code>Some(slot)</code> only if the kmer is confirmed present; <code>None</code> for absent or out-of-range.</p>
|
|
<div class="highlight"><pre><span></span><code>find_exact: slot = mphf(kmer); decode evidence → (chunk_id, rank); verify kmer in unitigs
|
|
find_approx: slot = mphf(kmer); check fingerprint[slot] == seq_hash(kmer)
|
|
</code></pre></div>
|
|
<p><code>block_bits</code> controls the <code>.idx</code> file written alongside <code>evidence.bin</code>. At <code>block_bits = 0</code>, every unitig chunk has an index entry, giving O(1) random access; larger values trade access time for a smaller <code>.idx</code>.</p>
|
|
<p>The MPHF and <code>unitigs.bin</code> are never rebuilt by any post-build operation.</p>
|
|
<hr />
|
|
<h2 id="layerd-mphf-data-payload">Layer&lt;D&gt; — MPHF + data payload</h2>
|
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">Layer</span><span class="o"><</span><span class="n">D</span><span class="p">:</span><span class="w"> </span><span class="nc">LayerData</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="p">()</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="n">mphf</span><span class="p">:</span><span class="w"> </span><span class="nc">MphfLayer</span><span class="p">,</span>
|
|
<span class="w"> </span><span class="n">data</span><span class="p">:</span><span class="w"> </span><span class="nc">D</span><span class="p">,</span>
|
|
<span class="p">}</span>
|
|
</code></pre></div>
|
|
<p><code>D</code> selects the attached data payload:</p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th><code>D</code></th>
|
|
<th>Data directory</th>
|
|
<th><code>Item</code> returned by <code>query</code></th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td><code>()</code></td>
|
|
<td>—</td>
|
|
<td><code>()</code> (set membership only)</td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>PersistentCompactIntMatrix</code></td>
|
|
<td><code>counts/</code></td>
|
|
<td><code>Box<[u32]></code> (counts per genome)</td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>PersistentBitMatrix</code></td>
|
|
<td><code>presence/</code></td>
|
|
<td><code>Box<[bool]></code> (presence per genome)</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<p><code>Layer::query(kmer)</code> delegates to <code>MphfLayer::find</code>, then calls <code>data.read(slot)</code> if a slot is returned. Both exact and approximate evidence are handled transparently; the caller sees only <code>Option<Hit<D::Item>></code>.</p>
|
|
<p>Build-time entry points:</p>
|
|
<div class="highlight"><pre><span></span><code><span class="n">Layer</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="p">::</span><span class="n">build</span><span class="p">(</span><span class="n">out_dir</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">)</span><span class="w"> </span><span class="c1">// set membership</span>
|
|
<span class="n">Layer</span><span class="o"><</span><span class="n">PersistentCompactIntMatrix</span><span class="o">></span><span class="p">::</span><span class="n">build</span><span class="p">(</span><span class="n">out_dir</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">,</span><span class="w"> </span><span class="n">count_of</span><span class="p">)</span>
|
|
<span class="n">Layer</span><span class="o"><</span><span class="n">PersistentBitMatrix</span><span class="o">></span><span class="p">::</span><span class="n">build_presence</span><span class="p">(</span><span class="n">out_dir</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">,</span><span class="w"> </span><span class="n">n_genomes</span><span class="p">,</span><span class="w"> </span><span class="n">present_in</span><span class="p">)</span>
|
|
|
|
<span class="n">Layer</span><span class="p">::</span><span class="o"><</span><span class="p">()</span><span class="o">></span><span class="p">::</span><span class="n">build_evidence</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">,</span><span class="w"> </span><span class="n">kind</span><span class="p">,</span><span class="w"> </span><span class="n">block_bits</span><span class="p">)</span><span class="w"> </span><span class="c1">// evidence only (reindex path)</span>
|
|
</code></pre></div>
|
|
<hr />
|
|
<h2 id="datastore-slot-indexed-data">DataStore — slot-indexed data</h2>
|
|
<p><code>PersistentCompactIntMatrix</code> and <code>PersistentBitMatrix</code> are slot-indexed stores. They know nothing about kmers or MPHFs.</p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th><code>Item</code></th>
|
|
<th>Aggregation method</th>
|
|
<th>Use</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td><code>PersistentCompactIntMatrix</code></td>
|
|
<td><code>Box<[u32]></code></td>
|
|
<td><code>sum() → Array1<u64></code></td>
|
|
<td>counts per genome per slot</td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>PersistentBitMatrix</code></td>
|
|
<td><code>Box<[bool]></code></td>
|
|
<td><code>count_ones() → Array1<u64></code></td>
|
|
<td>presence per genome per slot</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<hr />
|
|
<h2 id="aggregation-traits-obicompactvectraits">Aggregation traits — <code>obicompactvec::traits</code></h2>
|
|
<p>Three traits unify the aggregation API across all hierarchy levels.</p>
|
|
<div class="highlight"><pre><span></span><code><span class="k">trait</span><span class="w"> </span><span class="n">ColumnWeights</span><span class="p">:</span><span class="w"> </span><span class="nb">Send</span><span class="w"> </span><span class="o">+</span><span class="w"> </span><span class="nb">Sync</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">col_weights</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">;</span>
|
|
<span class="p">}</span>
|
|
|
|
<span class="k">trait</span><span class="w"> </span><span class="n">CountPartials</span><span class="p">:</span><span class="w"> </span><span class="nc">ColumnWeights</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">;</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_threshold_jaccard</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">threshold</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">);</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_hellinger</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="p">;</span>
|
|
<span class="w"> </span><span class="c1">// provided finalisation methods with default impls</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
|
<span class="w"> </span><span class="c1">// …</span>
|
|
<span class="p">}</span>
|
|
|
|
<span class="k">trait</span><span class="w"> </span><span class="n">BitPartials</span><span class="p">:</span><span class="w"> </span><span class="nc">ColumnWeights</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_jaccard</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">);</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_hamming</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">;</span>
|
|
<span class="w"> </span><span class="c1">// provided</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">jaccard_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
|
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">hamming_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
|
<span class="p">}</span>
|
|
</code></pre></div>
|
|
<p>Leaf implementors:</p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Type</th>
|
|
<th>Traits</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td><code>PersistentCompactIntMatrix</code></td>
|
|
<td><code>ColumnWeights</code>, <code>CountPartials</code></td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>PersistentBitMatrix</code></td>
|
|
<td><code>ColumnWeights</code>, <code>BitPartials</code></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<hr />
|
|
<h2 id="layeredstores-recursive-aggregation-wrapper">LayeredStore&lt;S&gt; — recursive aggregation wrapper</h2>
|
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">struct</span><span class="w"> </span><span class="nc">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="p">(</span><span class="nb">Vec</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="p">);</span>
|
|
</code></pre></div>
|
|
<p>Three blanket impls propagate all traits up the hierarchy:</p>
|
|
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">ColumnWeights</span><span class="o">></span><span class="w"> </span><span class="n">ColumnWeights</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
|
<span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">CountPartials</span><span class="o">></span><span class="w"> </span><span class="n">CountPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
|
<span class="k">impl</span><span class="o"><</span><span class="n">S</span><span class="p">:</span><span class="w"> </span><span class="nc">BitPartials</span><span class="o">></span><span class="w"> </span><span class="n">BitPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">LayeredStore</span><span class="o"><</span><span class="n">S</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="err">…</span><span class="w"> </span><span class="p">}</span>
|
|
</code></pre></div>
|
|
<p>This makes <code>LayeredStore<LayeredStore<PersistentCompactIntMatrix>></code> automatically implement <code>CountPartials</code> — no separate <code>PartitionedStore</code> type is needed:</p>
|
|
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix leaf (one layer)
|
|
LayeredStore<PersistentCompactIntMatrix> one partition (layers are disjoint)
|
|
LayeredStore<LayeredStore<…>> whole index (partitions are independent)
|
|
</code></pre></div>
|
|
<p>Normalised metrics require global column sums — computed in a two-pass cascade:</p>
|
|
<div class="highlight"><pre><span></span><code><span class="c1">// on LayeredStore<LayeredStore<PersistentCompactIntMatrix>></span>
|
|
<span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">global</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">col_weights</span><span class="p">();</span><span class="w"> </span><span class="c1">// pass 1 — sums up hierarchy</span>
|
|
<span class="w"> </span><span class="kd">let</span><span class="w"> </span><span class="n">p</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">partial_relfreq_bray</span><span class="p">(</span><span class="o">&</span><span class="n">global</span><span class="p">);</span><span class="w"> </span><span class="c1">// pass 2 — global broadcast read-only</span>
|
|
<span class="w"> </span><span class="n">p</span><span class="p">.</span><span class="n">mapv</span><span class="p">(</span><span class="o">|</span><span class="n">v</span><span class="o">|</span><span class="w"> </span><span class="mf">1.0</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">v</span><span class="p">)</span>
|
|
<span class="p">}</span>
|
|
</code></pre></div>
|
|
<p>Because each kmer belongs to exactly one <code>(partition, layer)</code> pair, <code>col_weights()</code> has no double-counting across the hierarchy.</p>
|
|
<hr />
|
|
<h2 id="progressive-aggregation-principle">Progressive aggregation principle</h2>
|
|
<p>No level reaches two levels down. Each level sums contributions from the level immediately below:</p>
|
|
<div class="highlight"><pre><span></span><code>PersistentCompactIntMatrix::col_weights() — one (partition, layer)
|
|
↓ Σ across layers
|
|
LayeredStore<PersistentCompactIntMatrix>::col_weights() — one partition
|
|
↓ Σ across partitions
|
|
LayeredStore<LayeredStore<…>>::col_weights() — global
|
|
</code></pre></div>
|
|
<p>The same cascade applies to every partial method.</p>
|
|
<hr />
|
|
<h2 id="multi-genome-column-invariant">Multi-genome column invariant</h2>
|
|
<p>After any merge, every layer in every partition has exactly <code>n_genomes</code> columns, where <code>n_genomes</code> is the current total in <code>index.meta</code>. This holds for both <code>PersistentCompactIntMatrix</code> and <code>PersistentBitMatrix</code>.</p>
|
|
<p>Maintained by three coordinated operations:</p>
|
|
<p><strong>Existing layers — column append.</strong> <code>Layer::append_genome_column</code> appends one column to each existing layer. Slots matching the incoming genome receive its count or <code>true</code>; all other slots receive 0 or <code>false</code>.</p>
|
|
<p><strong>New layers — absent columns prepended.</strong> When a new layer is created for kmers unique to the incoming genome, <code>n_existing_genomes</code> absent columns are prepended before the incoming genome's column, so the new layer immediately has the same column count as all other layers.</p>
|
|
<p><strong>First merge, Presence mode — <code>init_presence_matrix</code>.</strong> The initial single-genome index has no <code>presence/</code> directory (presence is implicit). On the first merge, <code>Layer<()>::init_presence_matrix</code> materialises genome 0's presence column (all <code>true</code>) retroactively, raising the column count from 0 to 1 before appending column 1.</p>
|
|
<p>This invariant is the precondition for correct progressive aggregation: every level can blindly sum matrices from below because all matrices have the same shape.</p>
|
|
<hr />
|
|
<h2 id="query-model">Query model</h2>
|
|
<h3 id="point-query">Point query</h3>
|
|
<div class="highlight"><pre><span></span><code>minimiser(kmer) → partition p
|
|
for each layer l in p:
|
|
if let Some(slot) = MphfLayer_l.find(kmer):
|
|
return data_l.read(slot)
|
|
return None
|
|
</code></pre></div>
|
|
<p>O(n_layers) MPHF probes worst case; O(1) expected. The result comes from exactly one <code>(partition, layer)</code>.</p>
|
|
<h3 id="aggregation">Aggregation</h3>
|
|
<div class="highlight"><pre><span></span><code>result = reduce(
|
|
for p in partitions: // parallel
|
|
for l in layers(p): // parallel
|
|
partial(data_p_l)
|
|
)
|
|
</code></pre></div>
|
|
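<p>For normalised metrics, replace this single reduction with the two-pass cascade: first reduce <code>col_weights()</code> over the hierarchy to obtain the global column sums, then reduce the corresponding partial, passing those global sums to every store read-only.</p>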
|
|
<hr />
|
|
<h2 id="parallelism-model">Parallelism model</h2>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Level</th>
|
|
<th>Unit</th>
|
|
<th>Coordination</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td>Across partitions</td>
|
|
<td>inner stores of <code>LayeredStore<LayeredStore<S>></code></td>
|
|
<td>none</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Across layers within a partition</td>
|
|
<td>inner stores of <code>LayeredStore<S></code></td>
|
|
<td>none — disjoint kmer sets</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Normalised pass 1 (<code>col_weights</code>)</td>
|
|
<td>per inner store</td>
|
|
<td>none — additive</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Normalised pass 2 (partial)</td>
|
|
<td>per inner store</td>
|
|
<td><code>global</code> broadcast read-only</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Within a matrix (distance)</td>
|
|
<td>upper-triangle pair <code>(i,j)</code></td>
|
|
<td>none — rayon <code>par_iter</code></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<hr />
|
|
<h2 id="reindex-evidence-conversion-in-place">reindex — evidence conversion in place</h2>
|
|
<p><code>KmerIndex::reindex(target, block_bits)</code> converts every layer's evidence bundle to <code>target</code> without touching the MPHF or <code>unitigs.bin</code>:</p>
|
|
<ul>
|
|
<li><code>→ Exact</code>: builds <code>evidence.bin</code> + <code>unitigs.bin.idx</code>; removes <code>fingerprint.bin</code></li>
|
|
<li><code>→ Approx { b, z }</code>: builds <code>fingerprint.bin</code>; removes <code>evidence.bin</code> + <code>unitigs.bin.idx</code></li>
|
|
</ul>
|
|
<p>On success, <code>IndexConfig::evidence</code> and <code>IndexConfig::block_bits</code> are updated in <code>index.meta</code>. Each layer's <code>layer_meta.json</code> is also rewritten with the new <code>EvidenceKind</code>.</p>
|
|
<hr />
|
|
<h2 id="estimate-parameter-dry-run">estimate — parameter dry-run</h2>
|
|
<p><code>estimate</code> resolves approximate-evidence parameters (<code>z</code>, <code>b</code>, target FP rate) and prints the resulting effective kmer size and the per-kmer and per-z-window false-positive rates without touching any index. It is used to calibrate <code>Approx { b, z }</code> before building or reindexing.</p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</article>
|
|
</div>
|
|
|
|
|
|
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
|
</div>
|
|
|
|
</main>
|
|
|
|
<footer class="md-footer">
|
|
|
|
<div class="md-footer-meta md-typeset">
|
|
<div class="md-footer-meta__inner md-grid">
|
|
<div class="md-copyright">
|
|
|
|
|
|
Made with
|
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
|
Material for MkDocs
|
|
</a>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
</div>
|
|
<div class="md-dialog" data-md-component="dialog">
|
|
<div class="md-dialog__inner md-typeset"></div>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
|
|
|
|
|
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
|
|
|
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
|
|
|
|
|
</body>
|
|
</html> |