bb7adc1154
Expands MkDocs navigation and documentation for evidence elimination, the merge command, and kmer filtering. Refactors kmer representation to a generic `KmerOf<L>` type with a bitwise reverse complement algorithm. Unifies MPHF construction, introduces approximate fingerprint-based indexing, and updates the pipeline, chunkreader, and storage layouts. Adds code coverage reports and clarifies architectural invariants for improved maintainability.
1577 lines
36 KiB
HTML
1577 lines
36 KiB
HTML
|
|
<!doctype html>
|
|
<html lang="en" class="no-js">
|
|
<head>
|
|
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width,initial-scale=1">
|
|
|
|
|
|
|
|
|
|
<link rel="prev" href="../persistent_bit_vec/">
|
|
|
|
|
|
<link rel="next" href="../rebuild_filter/">
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="icon" href="../../assets/images/favicon.png">
|
|
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
|
|
|
|
|
|
|
<title>Merge command - obikmer</title>
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
|
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
|
|
|
|
|
|
|
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
|
|
|
|
|
|
|
|
|
|
|
</head>
|
|
|
|
|
|
<body dir="ltr">
|
|
|
|
|
|
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
|
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
|
<label class="md-overlay" for="__drawer"></label>
|
|
<div data-md-component="skip">
|
|
|
|
|
|
<a href="#merge-command" class="md-skip">
|
|
Skip to content
|
|
</a>
|
|
|
|
</div>
|
|
<div data-md-component="announce">
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<header class="md-header md-header--shadow" data-md-component="header">
|
|
<nav class="md-header__inner md-grid" aria-label="Header">
|
|
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
<label class="md-header__button md-icon" for="__drawer">
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
|
</label>
|
|
<div class="md-header__title" data-md-component="header-title">
|
|
<div class="md-header__ellipsis">
|
|
<div class="md-header__topic">
|
|
<span class="md-ellipsis">
|
|
obikmer
|
|
</span>
|
|
</div>
|
|
<div class="md-header__topic" data-md-component="header-topic">
|
|
<span class="md-ellipsis">
|
|
|
|
Merge command
|
|
|
|
</span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
|
|
|
|
|
|
|
|
|
</nav>
|
|
|
|
</header>
|
|
|
|
<div class="md-container" data-md-component="container">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<main class="md-main" data-md-component="main">
|
|
<div class="md-main__inner md-grid">
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
|
<label class="md-nav__title" for="__drawer">
|
|
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
|
|
|
|
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
|
|
|
</a>
|
|
obikmer
|
|
</label>
|
|
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../.." class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Home
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Theory
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_2">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Theory
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../kmers/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmers and super-kmers
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/encoding/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
DNA encoding
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/entropy/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Entropy filter
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/minimizer/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Minimizer selection
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../theory/indexing/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Partitioning architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Implementation
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
|
<label class="md-nav__title" for="__nav_3">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Implementation
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../superkmer/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
SuperKmer
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../kmer/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmer
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../chunkreader/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Chunk reader
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../pipeline/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Construction pipeline
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../obipipeline/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
obipipeline library
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../storage/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
On-disk storage
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../mphf/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
MPHF selection
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../unitig_evidence/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Unitig evidence encoding
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../evidence_elimination/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Evidence elimination (discussion)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../obilayeredmap/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
obilayeredmap crate
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../persistent_compact_int_vec/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
PersistentCompactIntVec
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../persistent_bit_vec/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
PersistentBitVec
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--active">
|
|
|
|
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__link md-nav__link--active" for="__toc">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Merge command
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<a href="./" class="md-nav__link md-nav__link--active">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Merge command
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#purpose" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Purpose
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#modes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Modes
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#input-output-constraints" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Input / output constraints
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#evidence-compatibility" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Evidence compatibility
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#genome-label-deduplication" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Genome label deduplication
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#algorithm" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Algorithm
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Algorithm">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#1-validation" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
1. Validation
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#2-bootstrap-output-from-first-source" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
2. Bootstrap output from first source
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#3-for-each-subsequent-source-parallel-across-partitions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
3. For each subsequent source (parallel across partitions)
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#4-update-index-metadata" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
4. Update index metadata
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#append_genome_column" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
append_genome_column
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#column-count-invariant" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Column count invariant
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#error-variants-relevant-to-merge" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Error variants relevant to merge
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#on-disk-impact" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
On-disk impact
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../rebuild_filter/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmer filtering (rebuild/dump/unitig)
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item md-nav__item--nested">
|
|
|
|
|
|
|
|
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
|
|
|
|
|
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Architecture
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
<span class="md-nav__icon md-icon"></span>
|
|
</label>
|
|
|
|
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
|
<label class="md-nav__title" for="__nav_4">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
|
|
|
|
Architecture
|
|
|
|
|
|
</label>
|
|
<ul class="md-nav__list" data-md-scrollfix>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Sequences
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<li class="md-nav__item">
|
|
<a href="../../architecture/index_architecture/" class="md-nav__link">
|
|
|
|
|
|
|
|
<span class="md-ellipsis">
|
|
|
|
|
|
Kmer index
|
|
|
|
|
|
|
|
</span>
|
|
|
|
|
|
|
|
</a>
|
|
</li>
|
|
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
|
|
|
|
</ul>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
|
<div class="md-sidebar__scrollwrap">
|
|
<div class="md-sidebar__inner">
|
|
|
|
|
|
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<label class="md-nav__title" for="__toc">
|
|
<span class="md-nav__icon md-icon"></span>
|
|
Table of contents
|
|
</label>
|
|
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#purpose" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Purpose
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#modes" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Modes
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#input-output-constraints" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Input / output constraints
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#evidence-compatibility" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Evidence compatibility
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#genome-label-deduplication" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Genome label deduplication
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#algorithm" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Algorithm
|
|
|
|
</span>
|
|
</a>
|
|
|
|
<nav class="md-nav" aria-label="Algorithm">
|
|
<ul class="md-nav__list">
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#1-validation" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
1. Validation
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#2-bootstrap-output-from-first-source" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
2. Bootstrap output from first source
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#3-for-each-subsequent-source-parallel-across-partitions" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
3. For each subsequent source (parallel across partitions)
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#4-update-index-metadata" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
4. Update index metadata
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
</nav>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#append_genome_column" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
append_genome_column
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#column-count-invariant" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Column count invariant
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#error-variants-relevant-to-merge" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
Error variants relevant to merge
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
<li class="md-nav__item">
|
|
<a href="#on-disk-impact" class="md-nav__link">
|
|
<span class="md-ellipsis">
|
|
|
|
On-disk impact
|
|
|
|
</span>
|
|
</a>
|
|
|
|
</li>
|
|
|
|
</ul>
|
|
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<div class="md-content" data-md-component="content">
|
|
|
|
<article class="md-content__inner md-typeset">
|
|
|
|
|
|
|
|
|
|
|
|
<h1 id="merge-command">Merge command</h1>
|
|
<h2 id="purpose">Purpose</h2>
|
|
<p><code>obikmer merge</code> combines multiple existing kmer indexes into a single index. The result contains all kmers from all sources, with per-genome presence/absence or count data for every genome across every layer.</p>
|
|
<hr />
|
|
<h2 id="modes">Modes</h2>
|
|
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">enum</span><span class="w"> </span><span class="nc">MergeMode</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">Presence</span><span class="p">,</span><span class="w"> </span><span class="n">Count</span><span class="w"> </span><span class="p">}</span>
|
|
</code></pre></div>
|
|
<p>Default mode is <code>Presence</code>. <code>Count</code> mode requires <strong>all</strong> source indexes to have <code>with_counts=true</code>; mixing count and non-count sources is rejected at validation.</p>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Mode</th>
|
|
<th>Column type</th>
|
|
<th>Constraint</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td><code>Presence</code></td>
|
|
<td><code>PersistentBitMatrix</code> (one bit per genome per slot)</td>
|
|
<td>none</td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>Count</code></td>
|
|
<td><code>PersistentCompactIntMatrix</code> (one u32 per genome per slot)</td>
|
|
<td>all sources <code>with_counts=true</code></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<hr />
|
|
<h2 id="input-output-constraints">Input / output constraints</h2>
|
|
<p>All source indexes must satisfy:</p>
|
|
<ul>
|
|
<li><code>IndexState::Indexed</code> (fully built — <code>index.done</code> sentinel present)</li>
|
|
<li>Same <code>kmer_size</code>, <code>minimizer_size</code>, <code>n_partitions</code></li>
|
|
<li>Same evidence kind: all <code>Exact</code>, or all <code>Approx</code> with identical <code>(b, z)</code> parameters</li>
|
|
<li>If <code>Count</code> mode: all sources must have <code>with_counts=true</code></li>
|
|
</ul>
|
|
<p><code>--force</code>: if the output directory already exists, it is deleted before the merge begins.</p>
|
|
<hr />
|
|
<h2 id="evidence-compatibility">Evidence compatibility</h2>
|
|
<p><code>validate_evidence_compat(sources)</code> is called before any I/O. It compares each source's <code>EvidenceKind</code> against <code>sources[0]</code>:</p>
|
|
<ul>
|
|
<li>All <code>Exact</code> → accepted, output uses <code>Exact</code></li>
|
|
<li>All <code>Approx { b, z }</code> with same <code>(b, z)</code> → accepted, output uses those parameters</li>
|
|
<li>Any other combination → <code>OKIError::IncompatibleEvidence</code>, with a message directing the user to run <code>reindex</code> first</li>
|
|
</ul>
|
|
<p>Mixed exact/approx is a hard error, not a silent conversion.</p>
|
|
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">validate_evidence_compat</span><span class="p">(</span><span class="n">sources</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="p">[</span><span class="o">&amp;</span><span class="n">KmerIndex</span><span class="p">])</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">OKIResult</span><span class="o">&lt;</span><span class="n">EvidenceKind</span><span class="o">&gt;</span>
|
|
</code></pre></div>
|
|
<hr />
|
|
<h2 id="genome-label-deduplication">Genome label deduplication</h2>
|
|
<p><code>compute_labels(sources, rename_duplicates)</code> assigns final genome labels across all sources before any file is written. The first occurrence of a label keeps the original name. Subsequent occurrences receive <code>.1</code>, <code>.2</code>, … suffixes when <code>rename_duplicates</code> is true, or trigger <code>OKIError::DuplicateGenomeLabel</code> otherwise.</p>
|
|
<hr />
|
|
<h2 id="algorithm">Algorithm</h2>
|
|
<h3 id="1-validation">1. Validation</h3>
|
|
<p>Check all sources against the constraints above. Abort on any mismatch.</p>
|
|
<h3 id="2-bootstrap-output-from-first-source">2. Bootstrap output from first source</h3>
|
|
<p>Recursive file copy of <code>sources[0]</code> → <code>output</code>. Immediately after the copy:</p>
|
|
<ul>
|
|
<li><code>index.meta</code> is rewritten with the final genome list (all sources, possibly renamed) and the effective evidence kind.</li>
|
|
<li>In <code>Presence</code> mode, any <code>counts/</code> directories inherited from source_0 are removed.</li>
|
|
<li><code>spectrums/</code> from source_0 is removed and rebuilt from scratch across all sources, applying the (possibly renamed) labels.</li>
|
|
</ul>
|
|
<p>This establishes the partition layout, all existing MPHFs, unitigs, and evidence files. The first source's genomes occupy columns 0 … <code>n_dst_genomes - 1</code> in the destination.</p>
|
|
<h3 id="3-for-each-subsequent-source-parallel-across-partitions">3. For each subsequent source (parallel across partitions)</h3>
|
|
<p><code>KmerPartition::merge_partition(i, sources, mode, n_dst_genomes, block_bits)</code> is called for each partition index <code>i</code>. <code>block_bits</code> is taken from <code>dst.meta.config.block_bits</code>.</p>
|
|
<p>Each entry in <code>sources</code> is <code>(&amp;KmerPartition, n_genomes)</code> where <code>n_genomes</code> is the column count that source contributes (&gt; 1 when the source is itself a merged index).</p>
|
|
<p><strong>First merge, Presence mode</strong>: when <code>n_dst_genomes == 1</code>, <code>Layer::&lt;()&gt;::init_presence_matrix</code> is called on every existing destination layer before any source column is appended. This creates <code>presence/col_000000.pbiv</code> with every bit set to true (genome 0 is present in every slot).</p>
|
|
<p><strong>Pass 1 — classify kmers</strong></p>
|
|
<p>Iterate all kmers from all source partitions (via <code>UnitigFileReader</code> + canonical kmer iteration). For each kmer, probe the destination <code>LayeredMap&lt;()&gt;</code>:</p>
|
|
<ul>
|
|
<li><strong>Hit</strong>: kmer already in the destination; record for Pass 2.</li>
|
|
<li><strong>Miss</strong>: push kmer into a <code>GraphDeBruijn</code> accumulator.</li>
|
|
</ul>
|
|
<p><strong>New layer construction</strong></p>
|
|
<p>If the accumulator is non-empty, compute de Bruijn unitigs and call <code>Layer::&lt;()&gt;::build(&amp;new_layer_dir, block_bits)</code>. All kmers absent from the destination — across <strong>all</strong> sources — accumulate into a <strong>single</strong> graph, producing one new layer per merge operation (not one per source).</p>
|
|
<p><strong>Pass 2 — fill column builders</strong></p>
|
|
<p>For each source and each of its layers, re-iterate unitigs and look up stored values via <code>SrcLayerData::lookup(kmer, src_n)</code>:</p>
|
|
<ul>
|
|
<li><code>SrcLayerData::SetMembership</code> — no data directory exists; every kmer returns <code>vec![1; n_genomes]</code></li>
|
|
<li><code>SrcLayerData::Presence</code> — reads <code>PersistentBitMatrix</code> from <code>presence/</code></li>
|
|
<li><code>SrcLayerData::Count</code> — reads <code>PersistentCompactIntMatrix</code> from <code>counts/</code></li>
|
|
</ul>
|
|
<p>Hits are routed to <code>exist_builders[dst_layer][src_col]</code>; misses are routed to <code>new_src_builders[src_col]</code>.</p>
|
|
<p><strong>Column prepending for new layers</strong></p>
|
|
<p>Before source columns are written to the new layer, <code>n_dst_genomes</code> absent columns (all-zero / all-false) are prepended — one per genome already in the index — so the column count invariant holds immediately after layer creation.</p>
|
|
<p><strong>Close and update metadata</strong></p>
|
|
<p>Close all builders; update <code>presence/meta.json</code> or <code>counts/meta.json</code> with <code>{"n": N, "n_cols": n_dst_genomes + n_src_total}</code>; increment <code>PartitionMeta::n_layers</code> if a new layer was added.</p>
|
|
<h3 id="4-update-index-metadata">4. Update index metadata</h3>
|
|
<p><code>index.meta</code> was already written during bootstrap with the complete genome list and evidence kind. No further update is needed after the partition loop.</p>
|
|
<hr />
|
|
<h2 id="append_genome_column"><code>append_genome_column</code></h2>
|
|
<p>Defined on two concrete specialisations of <code>Layer&lt;D&gt;</code>:</p>
|
|
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o">&lt;</span><span class="n">PersistentCompactIntMatrix</span><span class="o">&gt;</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">append_genome_column</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">value_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">OLMResult</span><span class="o">&lt;</span><span class="p">()</span><span class="o">&gt;</span>
|
|
<span class="p">}</span>
|
|
|
|
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o">&lt;</span><span class="n">PersistentBitMatrix</span><span class="o">&gt;</span><span class="w"> </span><span class="p">{</span>
|
|
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">append_genome_column</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">value_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">bool</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">OLMResult</span><span class="o">&lt;</span><span class="p">()</span><span class="o">&gt;</span>
|
|
<span class="p">}</span>
|
|
</code></pre></div>
|
|
<p>Each appends one column file to the matrix subdirectory (<code>counts/</code> or <code>presence/</code>). In <code>merge_partition</code>, columns are written directly via <code>PersistentBitVecBuilder</code> / <code>PersistentCompactIntVecBuilder</code> rather than through these helpers, but the invariant they enforce is the same.</p>
|
|
<hr />
|
|
<h2 id="column-count-invariant">Column count invariant</h2>
|
|
<p>After any merge, <strong>every layer in every partition has exactly <code>n_genomes</code> columns</strong>, where <code>n_genomes</code> is the total genome count in the index at that point.</p>
|
|
<p>Maintained by three mechanisms:</p>
|
|
<ol>
|
|
<li><strong>Existing layers</strong>: <code>n_src_total</code> columns appended (one per source genome).</li>
|
|
<li><strong>New layers created during merge</strong>: <code>n_dst_genomes</code> absent columns prepended before source columns.</li>
|
|
<li><strong>First merge, Presence mode</strong>: <code>init_presence_matrix</code> retroactively creates <code>presence/col_000000.pbiv</code> all-true for genome 0.</li>
|
|
</ol>
|
|
<p>The invariant is a precondition of <code>LayeredStore</code> aggregation traits: <code>col_weights()</code> and all partial distance methods assume every inner store has the same column count.</p>
|
|
<hr />
|
|
<h2 id="error-variants-relevant-to-merge">Error variants relevant to merge</h2>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Variant</th>
|
|
<th>Condition</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td><code>OKIError::NotIndexed(path)</code></td>
|
|
<td>Source not in <code>Indexed</code> state</td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>OKIError::IncompatibleConfig</code></td>
|
|
<td>Mismatched <code>kmer_size</code>, <code>minimizer_size</code>, or <code>n_partitions</code></td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>OKIError::MismatchedMode</code></td>
|
|
<td>Count mode but a source has <code>with_counts=false</code></td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>OKIError::IncompatibleEvidence(msg)</code></td>
|
|
<td>Sources mix exact and approximate evidence, or use different approximate parameters <code>(b, z)</code></td>
|
|
</tr>
|
|
<tr>
|
|
<td><code>OKIError::DuplicateGenomeLabel(label)</code></td>
|
|
<td>Duplicate label and <code>rename_duplicates=false</code></td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
<hr />
|
|
<h2 id="on-disk-impact">On-disk impact</h2>
|
|
<p>After merging <code>G</code> genomes in total (the first source, sources_0, contributes <code>G0</code> genomes; the subsequent sources contribute the rest):</p>
|
|
<div class="highlight"><pre><span></span><code>partitions/
|
|
part_00000/
|
|
index/
|
|
meta.json ← n_layers updated if new layer added
|
|
layer_0/
|
|
mphf.bin ← unchanged
|
|
unitigs.bin ← unchanged
|
|
evidence.bin ← unchanged
|
|
presence/ ← created on first merge (Presence mode)
|
|
meta.json {"n": N, "n_cols": G}
|
|
col_000000.pbiv ← all-true (genome 0 … G0-1)
|
|
col_000001.pbiv ← next source
|
|
...
|
|
counts/ ← extended (Count mode)
|
|
meta.json {"n": N, "n_cols": G}
|
|
col_000000.pciv ← genome 0 counts (from original build)
|
|
col_000001.pciv ← next source
|
|
...
|
|
layer_N/ ← new layer (if new kmers found)
|
|
mphf.bin
|
|
unitigs.bin
|
|
evidence.bin
|
|
presence/ or counts/
|
|
meta.json {"n": N1, "n_cols": G}
|
|
col_000000.pbiv ← all-false (absent for existing genomes)
|
|
...
|
|
spectrums/
|
|
<label>.json ← one file per genome, rebuilt from all sources
|
|
index.meta ← complete genome list + evidence kind written at bootstrap
|
|
</code></pre></div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
</article>
|
|
</div>
|
|
|
|
|
|
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
|
</div>
|
|
|
|
</main>
|
|
|
|
<footer class="md-footer">
|
|
|
|
<div class="md-footer-meta md-typeset">
|
|
<div class="md-footer-meta__inner md-grid">
|
|
<div class="md-copyright">
|
|
|
|
|
|
Made with
|
|
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
|
Material for MkDocs
|
|
</a>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
</div>
|
|
<div class="md-dialog" data-md-component="dialog">
|
|
<div class="md-dialog__inner md-typeset"></div>
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
|
|
|
|
|
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
|
|
|
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
|
|
|
|
|
</body>
|
|
</html> |