bb7adc1154
Expands MkDocs navigation and documentation for evidence elimination, the merge command, and kmer filtering. Refactors kmer representation to a generic `KmerOf<L>` type with a bitwise reverse complement algorithm. Unifies MPHF construction, introduces approximate fingerprint-based indexing, and updates the pipeline, chunkreader, and storage layouts. Adds code coverage reports and clarifies architectural invariants for improved maintainability.
1461 lines
31 KiB
HTML
1461 lines
31 KiB
HTML
|
||
<!doctype html>
|
||
<html lang="en" class="no-js">
|
||
<head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
|
||
|
||
|
||
|
||
<link rel="prev" href="../unitig_evidence/">
|
||
|
||
|
||
<link rel="next" href="../obilayeredmap/">
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="icon" href="../../assets/images/favicon.png">
|
||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||
|
||
|
||
|
||
<title>Evidence elimination (discussion) - obikmer</title>
|
||
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||
|
||
|
||
|
||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
|
||
<body dir="ltr">
|
||
|
||
|
||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||
<label class="md-overlay" for="__drawer"></label>
|
||
<div data-md-component="skip">
|
||
|
||
|
||
<a href="#approximate-evidence-fingerprint-based-index" class="md-skip">
|
||
Skip to content
|
||
</a>
|
||
|
||
</div>
|
||
<div data-md-component="announce">
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<header class="md-header md-header--shadow" data-md-component="header">
|
||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
<label class="md-header__button md-icon" for="__drawer">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||
</label>
|
||
<div class="md-header__title" data-md-component="header-title">
|
||
<div class="md-header__ellipsis">
|
||
<div class="md-header__topic">
|
||
<span class="md-ellipsis">
|
||
obikmer
|
||
</span>
|
||
</div>
|
||
<div class="md-header__topic" data-md-component="header-topic">
|
||
<span class="md-ellipsis">
|
||
|
||
Evidence elimination (discussion)
|
||
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||
|
||
|
||
|
||
|
||
</nav>
|
||
|
||
</header>
|
||
|
||
<div class="md-container" data-md-component="container">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<main class="md-main" data-md-component="main">
|
||
<div class="md-main__inner md-grid">
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||
<label class="md-nav__title" for="__drawer">
|
||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
obikmer
|
||
</label>
|
||
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../.." class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Home
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Theory
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_2">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Theory
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../kmers/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmers and super-kmers
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/encoding/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
DNA encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/entropy/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Entropy filter
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Minimizer selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/indexing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Partitioning architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||
<label class="md-nav__title" for="__nav_3">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../superkmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
SuperKmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../kmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../chunkreader/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Chunk reader
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../pipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obipipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obipipeline library
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../storage/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
On-disk storage
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../mphf/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
MPHF selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../unitig_evidence/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Unitig evidence encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active">
|
||
|
||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Evidence elimination (discussion)
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<a href="./" class="md-nav__link md-nav__link--active">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Evidence elimination (discussion)
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#motivation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Motivation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#the-findere-model" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
The Findere model
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#fingerprintvec-on-disk" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
FingerprintVec on disk
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#evidencekind-and-metadata" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
EvidenceKind and metadata
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#parameter-resolution-resolve_approx_params" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Parameter resolution (resolve_approx_params)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#cli-flags" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
CLI flags
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#reindex-command" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
reindex command
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#estimate-command" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
estimate command
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obilayeredmap/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obilayeredmap crate
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../persistent_compact_int_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentCompactIntVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../persistent_bit_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentBitVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../merge/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Merge command
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../rebuild_filter/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer filtering (rebuild/dump/unitig)
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_4">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Sequences
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/index_architecture/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer index
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#motivation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Motivation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#the-findere-model" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
The Findere model
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#fingerprintvec-on-disk" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
FingerprintVec on disk
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#evidencekind-and-metadata" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
EvidenceKind and metadata
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#parameter-resolution-resolve_approx_params" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Parameter resolution (resolve_approx_params)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#cli-flags" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
CLI flags
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#reindex-command" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
reindex command
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#estimate-command" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
estimate command
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-content" data-md-component="content">
|
||
|
||
<article class="md-content__inner md-typeset">
|
||
|
||
|
||
|
||
|
||
|
||
<h1 id="approximate-evidence-fingerprint-based-index">Approximate evidence: fingerprint-based index</h1>
|
||
<h2 id="motivation">Motivation</h2>
|
||
<p><code>evidence.bin</code> maps each MPHF slot to the position of the k-mer that owns it,
|
||
enabling zero-FP verification. On the bacterial BCT dataset (2048 partitions,
|
||
k=31, ~33 M k-mers/partition) it accounts for 66 % of the lookup-layer footprint:</p>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>file</th>
|
||
<th>size/partition</th>
|
||
<th>fraction</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td>evidence.bin</td>
|
||
<td>132 MB</td>
|
||
<td>66 %</td>
|
||
</tr>
|
||
<tr>
|
||
<td>unitigs.bin</td>
|
||
<td>58 MB</td>
|
||
<td>29 %</td>
|
||
</tr>
|
||
<tr>
|
||
<td>mphf.bin</td>
|
||
<td>10 MB</td>
|
||
<td>5 %</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><code>evidence.bin</code> is a bijection from MPHF-space to unitig-position-space and
|
||
costs at minimum ⌈log₂ N⌉ bits per slot — an information-theoretic floor with
|
||
only ~22 % packing headroom. Compression is not a path to elimination.</p>
|
||
<p>The approximate index replaces <code>evidence.bin</code> + <code>unitigs.bin.idx</code> with a
|
||
<code>fingerprint.bin</code> file. The MPHF and <code>unitigs.bin</code> are kept unchanged. Set
|
||
operations still require an exact index; the approximate index targets query
|
||
workloads that can tolerate a bounded false-positive rate.</p>
|
||
<hr />
|
||
<h2 id="the-findere-model">The Findere model</h2>
|
||
<p>A b-bit fingerprint stored per MPHF slot provides the discrimination that
|
||
<code>evidence.bin</code> would otherwise provide through full k-mer reconstruction.</p>
|
||
<p>For a foreign k-mer query, the MPHF maps it to some slot <code>i</code>. The fingerprint
|
||
stored at <code>i</code> belongs to the legitimate k-mer at that slot. A false positive occurs when the query's fingerprint matches it by chance:</p>
|
||
<div class="highlight"><pre><span></span><code>P(FP per k-mer) = 1 / 2^b
|
||
</code></pre></div>
|
||
<p>The Findere trick reduces the indexed k-mer size. When the user specifies k_user
|
||
and z, the index physically stores k-mers of size <code>s = k_user − z + 1</code>. At query
|
||
time, the same s-mer size is used. After collecting per-position s-mer results
|
||
over the full query sequence, a sliding window of size z aggregates z consecutive
|
||
s-mer hits into one confirmed k_user-mer hit, reducing the per-window FP rate:</p>
|
||
<div class="highlight"><pre><span></span><code>P(FP per k_user-mer) = 1 / 2^(b·z)
|
||
</code></pre></div>
|
||
<p><code>IndexConfig::kmer_size</code> stores <code>s = k_user − z + 1</code>, not k_user. Both indexing
|
||
and querying use this stored size via <code>set_k(idx.kmer_size())</code>.</p>
|
||
<p>Parameters b and z are stored in <code>layer_meta.json</code> (<code>EvidenceKind::Approx { b, z }</code>).</p>
|
||
<hr />
|
||
<h2 id="fingerprintvec-on-disk"><code>FingerprintVec</code> on disk</h2>
|
||
<p><code>fingerprint.bin</code> layout:</p>
|
||
<div class="highlight"><pre><span></span><code>magic: b"FPVF" (4 bytes)
|
||
b: u8 (bits per slot, 1..=64)
|
||
padding: [0u8; 3]
|
||
n: u64 LE (number of slots)
|
||
data: packed bits, ceil(n·b/8) bytes, Lsb0 order
|
||
</code></pre></div>
|
||
<p><code>FingerprintVec</code> is memory-mapped. The match check against a query k-mer:</p>
|
||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">matches</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">slot</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">fingerprint</span><span class="p">:</span><span class="w"> </span><span class="kt">u64</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">bool</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">get</span><span class="p">(</span><span class="n">slot</span><span class="p">)</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="p">(</span><span class="n">fingerprint</span><span class="w"> </span><span class="o">&</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">mask</span><span class="p">)</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><code>build_approx_evidence</code> iterates <code>unitigs.bin</code> sequentially, writes
|
||
<code>kmer.seq_hash()</code> into the slot assigned by the MPHF, then saves <code>fingerprint.bin</code>
|
||
and <code>layer_meta.json</code>. No <code>.idx</code> file is produced; random access into
|
||
<code>unitigs.bin</code> is not needed.</p>
|
||
<p>At query time, <code>find_approx</code> in <code>MphfLayer</code>:</p>
|
||
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">slot</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">mphf</span><span class="p">.</span><span class="n">index</span><span class="p">(</span><span class="o">&</span><span class="n">kmer</span><span class="p">.</span><span class="n">raw</span><span class="p">());</span>
|
||
<span class="k">if</span><span class="w"> </span><span class="n">fingerprint</span><span class="p">.</span><span class="n">matches</span><span class="p">(</span><span class="n">slot</span><span class="p">,</span><span class="w"> </span><span class="n">kmer</span><span class="p">.</span><span class="n">seq_hash</span><span class="p">())</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="nb">Some</span><span class="p">(</span><span class="n">slot</span><span class="p">)</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="nb">None</span><span class="w"> </span><span class="p">}</span>
|
||
</code></pre></div>
|
||
<hr />
|
||
<h2 id="evidencekind-and-metadata"><code>EvidenceKind</code> and metadata</h2>
|
||
<p><code>layer_meta.json</code> records which evidence bundle is present:</p>
|
||
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">enum</span><span class="w"> </span><span class="nc">EvidenceKind</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="n">Exact</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">Approx</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">b</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="p">,</span><span class="w"> </span><span class="n">z</span><span class="p">:</span><span class="w"> </span><span class="kt">u8</span><span class="w"> </span><span class="p">},</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><code>MphfLayer::open</code> reads this tag and dispatches <code>find</code> to <code>find_exact</code> or
|
||
<code>find_approx</code> transparently. <code>find_exact</code> panics on an approximate layer;
|
||
<code>find_approx</code> panics on an exact layer — mode mixing is a programming error.</p>
|
||
<hr />
|
||
<h2 id="parameter-resolution-resolve_approx_params">Parameter resolution (<code>resolve_approx_params</code>)</h2>
|
||
<p>The identity <code>b·z = ⌈−log₂(fp)⌉</code> lets any two of (b, z, fp) derive the third.
|
||
<code>resolve_approx_params</code> implements a 2-of-3 rule with conservative ceiling
|
||
rounding:</p>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>given</th>
|
||
<th>derived</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td>b, z</td>
|
||
<td>fp = 1/2^(b·z)</td>
|
||
</tr>
|
||
<tr>
|
||
<td>z, fp</td>
|
||
<td>b = ⌈−log₂(fp) / z⌉</td>
|
||
</tr>
|
||
<tr>
|
||
<td>b, fp</td>
|
||
<td>z = ⌈−log₂(fp) / b⌉</td>
|
||
</tr>
|
||
<tr>
|
||
<td>z only</td>
|
||
<td>b = 8 (default), fp derived</td>
|
||
</tr>
|
||
<tr>
|
||
<td>b only</td>
|
||
<td>z = 1 (default), fp derived</td>
|
||
</tr>
|
||
<tr>
|
||
<td>fp only</td>
|
||
<td>b = 8 (default), z derived</td>
|
||
</tr>
|
||
<tr>
|
||
<td>none</td>
|
||
<td>b = 8, z = 1, fp = 1/256</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>When all three are given, b and z are authoritative and fp is recomputed.</p>
|
||
<hr />
|
||
<h2 id="cli-flags">CLI flags</h2>
|
||
<p>Both <code>index</code> and <code>reindex</code> accept the same flags:</p>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>flag</th>
|
||
<th>type</th>
|
||
<th>meaning</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td><code>--approx</code></td>
|
||
<td>bool</td>
|
||
<td>enable fingerprint evidence</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>--evidence-bits</code> (<code>b</code>)</td>
|
||
<td>u8</td>
|
||
<td>fingerprint bits per slot</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>-z</code></td>
|
||
<td>u8</td>
|
||
<td>Findere z parameter</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>--fp</code></td>
|
||
<td>f64</td>
|
||
<td>target FP rate per z-window</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>--block-size</code></td>
|
||
<td>usize</td>
|
||
<td>unitig block size for exact <code>.idx</code>; ignored in approx mode</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><code>--approx</code> must be set explicitly; <code>--evidence-bits</code>, <code>-z</code>, and <code>--fp</code> are optional and
|
||
resolved by the 2-of-3 rule. Omitting all three produces b=8, z=1.</p>
|
||
<hr />
|
||
<h2 id="reindex-command"><code>reindex</code> command</h2>
|
||
<p><code>reindex</code> converts an existing index between exact and approximate evidence
|
||
in-place across all partitions and layers, running partitions in parallel via
|
||
Rayon.</p>
|
||
<p>Conversion to approximate (<code>--approx</code>):</p>
|
||
<ul>
|
||
<li>Builds <code>fingerprint.bin</code> from <code>unitigs.bin</code> + <code>mphf.bin</code>.</li>
|
||
<li>Removes <code>evidence.bin</code> and <code>unitigs.bin.idx</code>.</li>
|
||
<li>Updates <code>layer_meta.json</code> with <code>EvidenceKind::Approx { b, z }</code>.</li>
|
||
</ul>
|
||
<p>Conversion to exact (default, no <code>--approx</code>):</p>
|
||
<ul>
|
||
<li>Builds <code>evidence.bin</code> + <code>unitigs.bin.idx</code> from <code>unitigs.bin</code> + <code>mphf.bin</code>.</li>
|
||
<li>Removes <code>fingerprint.bin</code>.</li>
|
||
<li>Updates <code>layer_meta.json</code> with <code>EvidenceKind::Exact</code>.</li>
|
||
</ul>
|
||
<p>The root <code>index.meta</code> is updated with the new evidence kind on success.
|
||
<code>mphf.bin</code> and <code>unitigs.bin</code> are never modified.</p>
|
||
<hr />
|
||
<h2 id="estimate-command"><code>estimate</code> command</h2>
|
||
<p><code>estimate</code> is a dry-run that resolves and prints (b, z, fp) without touching
|
||
any index. It accepts the same <code>--evidence-bits</code>, <code>-z</code>, and <code>--fp</code> flags and
|
||
additionally accepts <code>-k</code> to display the effective indexed k-mer length:</p>
|
||
<div class="highlight"><pre><span></span><code>k (user): 31
|
||
k (indexed, s=k-z+1): 27
|
||
z: 5
|
||
evidence bits (b): 8
|
||
FP per s-mer: 3.906e-3 (1/2^8)
|
||
FP per k-mer window: 9.095e-13 (1/2^(8·5))
|
||
</code></pre></div>
|
||
<p>Useful for choosing parameters before committing to an index build.</p>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
</div>
|
||
|
||
|
||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||
</div>
|
||
|
||
</main>
|
||
|
||
<footer class="md-footer">
|
||
|
||
<div class="md-footer-meta md-typeset">
|
||
<div class="md-footer-meta__inner md-grid">
|
||
<div class="md-copyright">
|
||
|
||
|
||
Made with
|
||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||
Material for MkDocs
|
||
</a>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
<div class="md-dialog" data-md-component="dialog">
|
||
<div class="md-dialog__inner md-typeset"></div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||
|
||
|
||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||
|
||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||
|
||
|
||
</body>
|
||
</html> |