Files
obikmer/doc/implementation/merge/index.html
T
Eric Coissac bb7adc1154 docs: expand kmer indexing, filtering, and merging documentation
Expands MkDocs navigation and documentation for evidence elimination, the merge command, and kmer filtering. Refactors kmer representation to a generic `KmerOf<L>` type with a bitwise reverse complement algorithm. Unifies MPHF construction, introduces approximate fingerprint-based indexing, and updates the pipeline, chunkreader, and storage layouts. Adds code coverage reports and clarifies architectural invariants for improved maintainability.
2026-06-04 22:59:41 +02:00

1577 lines
36 KiB
HTML

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../persistent_bit_vec/">
<link rel="next" href="../rebuild_filter/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Merge command - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#merge-command" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Merge command
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/minimizer/" class="md-nav__link">
<span class="md-ellipsis">
Minimizer selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../obipipeline/" class="md-nav__link">
<span class="md-ellipsis">
obipipeline library
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../unitig_evidence/" class="md-nav__link">
<span class="md-ellipsis">
Unitig evidence encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../evidence_elimination/" class="md-nav__link">
<span class="md-ellipsis">
Evidence elimination (discussion)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Merge command
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Merge command
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#purpose" class="md-nav__link">
<span class="md-ellipsis">
Purpose
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#modes" class="md-nav__link">
<span class="md-ellipsis">
Modes
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#input-output-constraints" class="md-nav__link">
<span class="md-ellipsis">
Input / output constraints
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#evidence-compatibility" class="md-nav__link">
<span class="md-ellipsis">
Evidence compatibility
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#genome-label-deduplication" class="md-nav__link">
<span class="md-ellipsis">
Genome label deduplication
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#algorithm" class="md-nav__link">
<span class="md-ellipsis">
Algorithm
</span>
</a>
<nav class="md-nav" aria-label="Algorithm">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#1-validation" class="md-nav__link">
<span class="md-ellipsis">
1. Validation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#2-bootstrap-output-from-first-source" class="md-nav__link">
<span class="md-ellipsis">
2. Bootstrap output from first source
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#3-for-each-subsequent-source-parallel-across-partitions" class="md-nav__link">
<span class="md-ellipsis">
3. For each subsequent source (parallel across partitions)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#4-update-index-metadata" class="md-nav__link">
<span class="md-ellipsis">
4. Update index metadata
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#append_genome_column" class="md-nav__link">
<span class="md-ellipsis">
append_genome_column
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#column-count-invariant" class="md-nav__link">
<span class="md-ellipsis">
Column count invariant
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#error-variants-relevant-to-merge" class="md-nav__link">
<span class="md-ellipsis">
Error variants relevant to merge
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#on-disk-impact" class="md-nav__link">
<span class="md-ellipsis">
On-disk impact
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../rebuild_filter/" class="md-nav__link">
<span class="md-ellipsis">
Kmer filtering (rebuild/dump/unitig)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#purpose" class="md-nav__link">
<span class="md-ellipsis">
Purpose
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#modes" class="md-nav__link">
<span class="md-ellipsis">
Modes
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#input-output-constraints" class="md-nav__link">
<span class="md-ellipsis">
Input / output constraints
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#evidence-compatibility" class="md-nav__link">
<span class="md-ellipsis">
Evidence compatibility
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#genome-label-deduplication" class="md-nav__link">
<span class="md-ellipsis">
Genome label deduplication
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#algorithm" class="md-nav__link">
<span class="md-ellipsis">
Algorithm
</span>
</a>
<nav class="md-nav" aria-label="Algorithm">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#1-validation" class="md-nav__link">
<span class="md-ellipsis">
1. Validation
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#2-bootstrap-output-from-first-source" class="md-nav__link">
<span class="md-ellipsis">
2. Bootstrap output from first source
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#3-for-each-subsequent-source-parallel-across-partitions" class="md-nav__link">
<span class="md-ellipsis">
3. For each subsequent source (parallel across partitions)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#4-update-index-metadata" class="md-nav__link">
<span class="md-ellipsis">
4. Update index metadata
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#append_genome_column" class="md-nav__link">
<span class="md-ellipsis">
append_genome_column
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#column-count-invariant" class="md-nav__link">
<span class="md-ellipsis">
Column count invariant
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#error-variants-relevant-to-merge" class="md-nav__link">
<span class="md-ellipsis">
Error variants relevant to merge
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#on-disk-impact" class="md-nav__link">
<span class="md-ellipsis">
On-disk impact
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="merge-command">Merge command</h1>
<h2 id="purpose">Purpose</h2>
<p><code>obikmer merge</code> combines multiple existing kmer indexes into a single index. The result contains all kmers from all sources, with per-genome presence/absence or count data for every genome across every layer.</p>
<hr />
<h2 id="modes">Modes</h2>
<div class="highlight"><pre><span></span><code><span class="k">pub</span><span class="w"> </span><span class="k">enum</span><span class="w"> </span><span class="nc">MergeMode</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">Presence</span><span class="p">,</span><span class="w"> </span><span class="n">Count</span><span class="w"> </span><span class="p">}</span>
</code></pre></div>
<p>Default mode is <code>Presence</code>. <code>Count</code> mode requires <strong>all</strong> source indexes to have <code>with_counts=true</code>; mixing count and non-count sources is rejected at validation.</p>
<table>
<thead>
<tr>
<th>Mode</th>
<th>Column type</th>
<th>Constraint</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>Presence</code></td>
<td><code>PersistentBitMatrix</code> (one bit per genome per slot)</td>
<td>none</td>
</tr>
<tr>
<td><code>Count</code></td>
<td><code>PersistentCompactIntMatrix</code> (one u32 per genome per slot)</td>
<td>all sources <code>with_counts=true</code></td>
</tr>
</tbody>
</table>
<hr />
<h2 id="input-output-constraints">Input / output constraints</h2>
<p>All source indexes must satisfy:</p>
<ul>
<li><code>IndexState::Indexed</code> (fully built — <code>index.done</code> sentinel present)</li>
<li>Same <code>kmer_size</code>, <code>minimizer_size</code>, <code>n_partitions</code></li>
<li>Same evidence kind: all <code>Exact</code>, or all <code>Approx</code> with identical <code>(b, z)</code> parameters</li>
<li>If <code>Count</code> mode: all sources must have <code>with_counts=true</code></li>
</ul>
<p><code>--force</code>: if the output directory already exists, it is deleted before the merge begins.</p>
<hr />
<h2 id="evidence-compatibility">Evidence compatibility</h2>
<p><code>validate_evidence_compat(sources)</code> is called before any I/O. It compares each source's <code>EvidenceKind</code> against <code>sources[0]</code>:</p>
<ul>
<li>All <code>Exact</code> → accepted, output uses <code>Exact</code></li>
<li>All <code>Approx { b, z }</code> with same <code>(b, z)</code> → accepted, output uses those parameters</li>
<li>Any other combination → <code>OKIError::IncompatibleEvidence</code>, with a message directing the user to run <code>reindex</code> first</li>
</ul>
<p>Mixed exact/approx is a hard error, not a silent conversion.</p>
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">validate_evidence_compat</span><span class="p">(</span><span class="n">sources</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="p">[</span><span class="o">&amp;</span><span class="n">KmerIndex</span><span class="p">])</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">OKIResult</span><span class="o">&lt;</span><span class="n">EvidenceKind</span><span class="o">&gt;</span>
</code></pre></div>
<hr />
<h2 id="genome-label-deduplication">Genome label deduplication</h2>
<p><code>compute_labels(sources, rename_duplicates)</code> assigns final genome labels across all sources before any file is written. The first occurrence of a label keeps the original name. Subsequent occurrences receive <code>.1</code>, <code>.2</code>, … suffixes when <code>rename_duplicates</code> is true, or trigger <code>OKIError::DuplicateGenomeLabel</code> otherwise.</p>
<hr />
<h2 id="algorithm">Algorithm</h2>
<h3 id="1-validation">1. Validation</h3>
<p>Check all sources against the constraints above. Abort on any mismatch.</p>
<h3 id="2-bootstrap-output-from-first-source">2. Bootstrap output from first source</h3>
<p>Recursive file copy of <code>sources[0]</code> → <code>output</code>. Immediately after the copy:</p>
<ul>
<li><code>index.meta</code> is rewritten with the final genome list (all sources, possibly renamed) and the effective evidence kind.</li>
<li>In <code>Presence</code> mode, any <code>counts/</code> directories inherited from source_0 are removed.</li>
<li><code>spectrums/</code> from source_0 is removed and rebuilt from scratch across all sources, applying the (possibly renamed) labels.</li>
</ul>
<p>This establishes the partition layout, all existing MPHFs, unitigs, and evidence files. The first source's genomes occupy columns 0 … <code>n_dst_genomes - 1</code> in the destination.</p>
<h3 id="3-for-each-subsequent-source-parallel-across-partitions">3. For each subsequent source (parallel across partitions)</h3>
<p><code>KmerPartition::merge_partition(i, sources, mode, n_dst_genomes, block_bits)</code> is called for each partition index <code>i</code>. <code>block_bits</code> is taken from <code>dst.meta.config.block_bits</code>.</p>
<p>Each entry in <code>sources</code> is <code>(&amp;KmerPartition, n_genomes)</code> where <code>n_genomes</code> is the column count that source contributes (&gt; 1 when the source is itself a merged index).</p>
<p><strong>First merge, Presence mode</strong>: when <code>n_dst_genomes == 1</code>, <code>Layer::&lt;()&gt;::init_presence_matrix</code> is called on every existing destination layer before any source column is appended. This creates <code>presence/col_000000.pbiv</code> set all-true (genome 0 is present in every slot).</p>
<p><strong>Pass 1 — classify kmers</strong></p>
<p>Iterate all kmers from all source partitions (via <code>UnitigFileReader</code> + canonical kmer iteration). For each kmer, probe the destination <code>LayeredMap&lt;()&gt;</code>:</p>
<ul>
<li><strong>Hit</strong>: kmer already in the destination; record for Pass 2.</li>
<li><strong>Miss</strong>: push kmer into a <code>GraphDeBruijn</code> accumulator.</li>
</ul>
<p><strong>New layer construction</strong></p>
<p>If the accumulator is non-empty, compute de Bruijn unitigs and call <code>Layer::&lt;()&gt;::build(&amp;new_layer_dir, block_bits)</code>. All kmers absent from the destination — across <strong>all</strong> sources — accumulate into a <strong>single</strong> graph, producing one new layer per merge operation (not one per source).</p>
<p><strong>Pass 2 — fill column builders</strong></p>
<p>For each source and each of its layers, re-iterate unitigs and look up stored values via <code>SrcLayerData::lookup(kmer, src_n)</code>:</p>
<ul>
<li><code>SrcLayerData::SetMembership</code> — no data directory exists; every kmer returns <code>vec![1; n_genomes]</code></li>
<li><code>SrcLayerData::Presence</code> — reads <code>PersistentBitMatrix</code> from <code>presence/</code></li>
<li><code>SrcLayerData::Count</code> — reads <code>PersistentCompactIntMatrix</code> from <code>counts/</code></li>
</ul>
<p>Hits are routed to <code>exist_builders[dst_layer][src_col]</code>; misses are routed to <code>new_src_builders[src_col]</code>.</p>
<p><strong>Column prepending for new layers</strong></p>
<p>Before source columns are written to the new layer, <code>n_dst_genomes</code> absent columns (all-zero / all-false) are prepended — one per genome already in the index — so the column count invariant holds immediately after layer creation.</p>
<p><strong>Close and update metadata</strong></p>
<p>Close all builders; update <code>presence/meta.json</code> or <code>counts/meta.json</code> with <code>{"n": N, "n_cols": n_dst_genomes + n_src_total}</code>; increment <code>PartitionMeta::n_layers</code> if a new layer was added.</p>
<h3 id="4-update-index-metadata">4. Update index metadata</h3>
<p><code>index.meta</code> was already written during bootstrap with the complete genome list and evidence kind. No further update is needed after the partition loop.</p>
<hr />
<h2 id="append_genome_column"><code>append_genome_column</code></h2>
<p>Defined on two concrete specialisations of <code>Layer&lt;D&gt;</code>:</p>
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o">&lt;</span><span class="n">PersistentCompactIntMatrix</span><span class="o">&gt;</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">append_genome_column</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">value_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">OLMResult</span><span class="o">&lt;</span><span class="p">()</span><span class="o">&gt;</span>
<span class="p">}</span>
<span class="k">impl</span><span class="w"> </span><span class="n">Layer</span><span class="o">&lt;</span><span class="n">PersistentBitMatrix</span><span class="o">&gt;</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">pub</span><span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">append_genome_column</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="nc">Path</span><span class="p">,</span><span class="w"> </span><span class="n">value_of</span><span class="p">:</span><span class="w"> </span><span class="nc">impl</span><span class="w"> </span><span class="nb">Fn</span><span class="p">(</span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">bool</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">OLMResult</span><span class="o">&lt;</span><span class="p">()</span><span class="o">&gt;</span>
<span class="p">}</span>
</code></pre></div>
<p>Each appends one column file to the matrix subdirectory (<code>counts/</code> or <code>presence/</code>). In <code>merge_partition</code>, columns are written directly via <code>PersistentBitVecBuilder</code> / <code>PersistentCompactIntVecBuilder</code> rather than through these helpers, but the invariant they enforce is the same.</p>
<hr />
<h2 id="column-count-invariant">Column count invariant</h2>
<p>After any merge, <strong>every layer in every partition has exactly <code>n_genomes</code> columns</strong>, where <code>n_genomes</code> is the total genome count in the index at that point.</p>
<p>Maintained by three mechanisms:</p>
<ol>
<li><strong>Existing layers</strong>: <code>n_src_total</code> columns appended (one per source genome).</li>
<li><strong>New layers created during merge</strong>: <code>n_dst_genomes</code> absent columns prepended before source columns.</li>
<li><strong>First merge, Presence mode</strong>: <code>init_presence_matrix</code> retroactively creates <code>presence/col_000000.pbiv</code> all-true for genome 0.</li>
</ol>
<p>The invariant is a precondition of <code>LayeredStore</code> aggregation traits: <code>col_weights()</code> and all partial distance methods assume every inner store has the same column count.</p>
<hr />
<h2 id="error-variants-relevant-to-merge">Error variants relevant to merge</h2>
<table>
<thead>
<tr>
<th>Variant</th>
<th>Condition</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>OKIError::NotIndexed(path)</code></td>
<td>Source not in <code>Indexed</code> state</td>
</tr>
<tr>
<td><code>OKIError::IncompatibleConfig</code></td>
<td>Mismatched <code>kmer_size</code>, <code>minimizer_size</code>, or <code>n_partitions</code></td>
</tr>
<tr>
<td><code>OKIError::MismatchedMode</code></td>
<td>Count mode but a source has <code>with_counts=false</code></td>
</tr>
<tr>
<td><code>OKIError::IncompatibleEvidence(msg)</code></td>
<td>Mixed exact/approx or different approx <code>(b, z)</code></td>
</tr>
<tr>
<td><code>OKIError::DuplicateGenomeLabel(label)</code></td>
<td>Duplicate label and <code>rename_duplicates=false</code></td>
</tr>
</tbody>
</table>
<hr />
<h2 id="on-disk-impact">On-disk impact</h2>
<p>After merging <code>G</code> genomes (source_0 contributes <code>G0</code>, subsequent sources contribute the rest):</p>
<div class="highlight"><pre><span></span><code>partitions/
part_00000/
index/
meta.json ← n_layers updated if new layer added
layer_0/
mphf.bin ← unchanged
unitigs.bin ← unchanged
evidence.bin ← unchanged
presence/ ← created on first merge (Presence mode)
meta.json {&quot;n&quot;: N, &quot;n_cols&quot;: G}
col_000000.pbiv ← all-true (genome 0 … G0-1)
col_000001.pbiv ← next source
...
counts/ ← extended (Count mode)
meta.json {&quot;n&quot;: N, &quot;n_cols&quot;: G}
col_000000.pciv ← genome 0 counts (from original build)
col_000001.pciv ← next source
...
layer_N/ ← new layer (if new kmers found)
mphf.bin
unitigs.bin
evidence.bin
presence/ or counts/
meta.json {&quot;n&quot;: N1, &quot;n_cols&quot;: G}
col_000000.pbiv ← all-false (absent for existing genomes)
...
spectrums/
&lt;label&gt;.json ← one file per genome, rebuilt from all sources
index.meta ← complete genome list + evidence kind written at bootstrap
</code></pre></div>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>