Files
obikmer/doc/implementation/unitig_evidence/index.html
T

1623 lines
36 KiB
HTML
Raw Normal View History

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../mphf/">
<link rel="next" href="../evidence_elimination/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Unitig evidence encoding - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#unitig-based-mphf-evidence-encoding" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Unitig evidence encoding
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/minimizer/" class="md-nav__link">
<span class="md-ellipsis">
Minimizer selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../obipipeline/" class="md-nav__link">
<span class="md-ellipsis">
obipipeline library
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
Unitig evidence encoding
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
Unitig evidence encoding
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#role-of-unitigs-in-the-index" class="md-nav__link">
<span class="md-ellipsis">
Role of unitigs in the index
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#binary-file-formats" class="md-nav__link">
<span class="md-ellipsis">
Binary file formats
</span>
</a>
<nav class="md-nav" aria-label="Binary file formats">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#unitigsbin-sequence-chunks" class="md-nav__link">
<span class="md-ellipsis">
unitigs.bin — sequence chunks
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#unitigsbinidx-block-sampled-offset-index" class="md-nav__link">
<span class="md-ellipsis">
unitigs.bin.idx — block-sampled offset index
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#evidencebin-per-slot-mphf-evidence" class="md-nav__link">
<span class="md-ellipsis">
evidence.bin — per-slot MPHF evidence
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#building-and-reading-the-index" class="md-nav__link">
<span class="md-ellipsis">
Building and reading the index
</span>
</a>
<nav class="md-nav" aria-label="Building and reading the index">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#build_unitig_idxpath-block_bits" class="md-nav__link">
<span class="md-ellipsis">
build_unitig_idx(path, block_bits)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#open-open_sequential-open_direct_access" class="md-nav__link">
<span class="md-ellipsis">
open(), open_sequential(), open_direct_access()
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#chunk_starti-access-modes" class="md-nav__link">
<span class="md-ellipsis">
chunk_start(i) — access modes
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#decoding-a-kmer-from-slot-s" class="md-nav__link">
<span class="md-ellipsis">
Decoding a kmer from slot s
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#field-widths-and-capacity" class="md-nav__link">
<span class="md-ellipsis">
Field widths and capacity
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#evidence-bit-cost" class="md-nav__link">
<span class="md-ellipsis">
Evidence bit-cost
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#unitig-decomposition-non-determinism" class="md-nav__link">
<span class="md-ellipsis">
Unitig decomposition non-determinism
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#partition-size-tradeoff" class="md-nav__link">
<span class="md-ellipsis">
Partition-size tradeoff
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#alternative-fingerprint-evidence" class="md-nav__link">
<span class="md-ellipsis">
Alternative: fingerprint evidence
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../evidence_elimination/" class="md-nav__link">
<span class="md-ellipsis">
Evidence elimination (discussion)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../merge/" class="md-nav__link">
<span class="md-ellipsis">
Merge command
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../rebuild_filter/" class="md-nav__link">
<span class="md-ellipsis">
Kmer filtering (rebuild/dump/unitig)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#role-of-unitigs-in-the-index" class="md-nav__link">
<span class="md-ellipsis">
Role of unitigs in the index
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#binary-file-formats" class="md-nav__link">
<span class="md-ellipsis">
Binary file formats
</span>
</a>
<nav class="md-nav" aria-label="Binary file formats">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#unitigsbin-sequence-chunks" class="md-nav__link">
<span class="md-ellipsis">
unitigs.bin — sequence chunks
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#unitigsbinidx-block-sampled-offset-index" class="md-nav__link">
<span class="md-ellipsis">
unitigs.bin.idx — block-sampled offset index
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#evidencebin-per-slot-mphf-evidence" class="md-nav__link">
<span class="md-ellipsis">
evidence.bin — per-slot MPHF evidence
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#building-and-reading-the-index" class="md-nav__link">
<span class="md-ellipsis">
Building and reading the index
</span>
</a>
<nav class="md-nav" aria-label="Building and reading the index">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#build_unitig_idxpath-block_bits" class="md-nav__link">
<span class="md-ellipsis">
build_unitig_idx(path, block_bits)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#open-open_sequential-open_direct_access" class="md-nav__link">
<span class="md-ellipsis">
open(), open_sequential(), open_direct_access()
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#chunk_starti-access-modes" class="md-nav__link">
<span class="md-ellipsis">
chunk_start(i) — access modes
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#decoding-a-kmer-from-slot-s" class="md-nav__link">
<span class="md-ellipsis">
Decoding a kmer from slot s
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#field-widths-and-capacity" class="md-nav__link">
<span class="md-ellipsis">
Field widths and capacity
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#evidence-bit-cost" class="md-nav__link">
<span class="md-ellipsis">
Evidence bit-cost
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#unitig-decomposition-non-determinism" class="md-nav__link">
<span class="md-ellipsis">
Unitig decomposition non-determinism
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#partition-size-tradeoff" class="md-nav__link">
<span class="md-ellipsis">
Partition-size tradeoff
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#alternative-fingerprint-evidence" class="md-nav__link">
<span class="md-ellipsis">
Alternative: fingerprint evidence
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="unitig-based-mphf-evidence-encoding">Unitig-based MPHF evidence encoding</h1>
<h2 id="role-of-unitigs-in-the-index">Role of unitigs in the index</h2>
<p>The MPHF maps each canonical kmer to an integer slot but provides no inverse: a slot index alone cannot reconstruct the kmer. The <strong>evidence file</strong> supplies this inverse: for each MPHF slot it stores a pointer into the unitig sequence file, from which k nucleotides can be extracted.</p>
<p>Unitigs are the natural compact representation: a run of L nucleotides encodes L k + 1 consecutive canonical kmers. The entire kmer set of a partition is reconstructible from its unitig binary file.</p>
<hr />
<h2 id="binary-file-formats">Binary file formats</h2>
<h3 id="unitigsbin-sequence-chunks"><code>unitigs.bin</code> — sequence chunks</h3>
<p>A sequence of binary records. Each record:</p>
<div class="highlight"><pre><span></span><code>[u8: seql k] [ceil(seql / 4) bytes: 2-bit packed nucleotides]
</code></pre></div>
<ul>
<li><code>seql k</code> (0255): nucleotide length minus k, so <code>seql = byte[0] + k</code> and <code>n_kmers = byte[0] + 1</code>.</li>
<li>Packed nucleotides: A=00, C=01, G=10, T=11, MSB-first within each byte; last byte zero-padded.</li>
<li>Byte count for packed sequence: <code>ceil(seql / 4)</code>.</li>
</ul>
<p>Unitigs with more than <code>MAX_KMERS_PER_CHUNK = 256</code> k-mers are transparently split into overlapping chunks. Each chunk has at most 256 k-mers (= <code>seql k + 1 ≤ 256</code>); consecutive chunks overlap by k1 nucleotides so no kmer is lost:</p>
<div class="highlight"><pre><span></span><code>chunk 1: nucleotides [0, MAX_KMERS_PER_CHUNK + k 2] (256 kmers)
chunk 2: nucleotides [256, end] (remaining kmers)
overlap: k1 nucleotides shared between the two chunks
</code></pre></div>
<h3 id="unitigsbinidx-block-sampled-offset-index"><code>unitigs.bin.idx</code> — block-sampled offset index</h3>
<div class="highlight"><pre><span></span><code>magic : 4 bytes = &quot;UIX3&quot;
block_bits: u32 LE — granularity parameter (031)
n_unitigs : u32 LE — total number of chunks in unitigs.bin
n_kmers : u64 LE — total number of kmers across all chunks
offsets : [u32 LE] — byte offsets into unitigs.bin, one per 2^block_bits chunks + sentinel
</code></pre></div>
<p>One offset entry is stored every <code>2^block_bits</code> chunks; the array is sentinel-terminated (last entry = file size). <code>DEFAULT_BLOCK_BITS = 0</code> stores one offset per chunk (exact table, no scan).</p>
<h3 id="evidencebin-per-slot-mphf-evidence"><code>evidence.bin</code> — per-slot MPHF evidence</h3>
<p>A flat array of u32 values, one per MPHF slot, no header:</p>
<div class="highlight"><pre><span></span><code>bits [31:7] = chunk_id (25 bits)
bits [6:0] = rank (7 bits, 0127)
</code></pre></div>
<p>File size = <code>n_slots × 4</code> bytes. <code>chunk_id</code> is the 0-based index of the record in <code>unitigs.bin</code>; <code>rank</code> is the position of the canonical kmer within that chunk (counting only canonical kmers). Encoding: <code>raw = (chunk_id &lt;&lt; 7) | (rank &amp; 0x7F)</code>. Decoding: <code>chunk_id = raw &gt;&gt; 7</code>, <code>rank = raw &amp; 0x7F</code>.</p>
<hr />
<h2 id="building-and-reading-the-index">Building and reading the index</h2>
<h3 id="build_unitig_idxpath-block_bits"><code>build_unitig_idx(path, block_bits)</code></h3>
<p>Scans <code>unitigs.bin</code> sequentially: for each chunk at byte offset <code>offset</code>, if <code>chunk_count &amp; mask == 0</code> (where <code>mask = (1 &lt;&lt; block_bits) 1</code>), appends <code>offset as u32</code> to <code>block_offsets</code>. After the scan, appends a sentinel (= total file size), then writes the <code>.idx</code> file. Called after the unitig file is fully written and closed.</p>
<h3 id="open-open_sequential-open_direct_access"><code>open()</code>, <code>open_sequential()</code>, <code>open_direct_access()</code></h3>
<p><code>UnitigFileReader</code> has three constructors:</p>
<ul>
<li><code>open(path)</code> — smart default: if <code>unitigs.bin.idx</code> exists, delegates to <code>open_direct_access</code>; otherwise delegates to <code>open_sequential</code>. Prefer this in call sites that don't require one specific mode.</li>
<li><code>open_sequential(path)</code> — never reads <code>.idx</code>. Sequential iterators only; <code>chunk_start(i)</code> falls back to an O(i) mmap scan rather than panicking.</li>
<li><code>open_direct_access(path)</code> — requires <code>.idx</code> to be present. Enables O(1) or O(2^block_bits) <code>chunk_start(i)</code>, used by <code>verify_canonical_kmer</code> at query time.</li>
</ul>
<p><code>CanonicalKmerIter</code> — a clonable sequential iterator returned by <code>UnitigFileReader::iter_canonical_kmers()</code>. It holds an <code>Arc&lt;Mmap&gt;</code> so cloning resets the cursor to the start without reopening the file. This makes it usable with <code>par_bridge()</code> for parallel MPHF construction without random access.</p>
<h3 id="chunk_starti-access-modes"><code>chunk_start(i)</code> — access modes</h3>
<p>When <code>.idx</code> is loaded (<code>open_direct_access</code>):</p>
<ul>
<li><code>block_bits = 0</code>: single array lookup, O(1).</li>
<li><code>block_bits &gt; 0</code>: lookup block, then scan ≤ 2^block_bits records, O(2^block_bits).</li>
</ul>
<p>When <code>.idx</code> is absent (<code>open_sequential</code>): <code>chunk_start(i)</code> performs an O(i) sequential mmap scan from offset 0. No panic — the function degrades gracefully. This degraded path is used by <code>find_strict()</code> on Approx layers (sequential scan of all canonical kmers).</p>
<h3 id="decoding-a-kmer-from-slot-s">Decoding a kmer from slot <code>s</code></h3>
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="p">(</span><span class="n">chunk_id</span><span class="p">,</span><span class="w"> </span><span class="n">rank</span><span class="p">)</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">evidence</span><span class="p">.</span><span class="n">decode</span><span class="p">(</span><span class="n">s</span><span class="p">);</span><span class="w"> </span><span class="c1">// u32 → (chunk_id: u32, rank: u8)</span>
<span class="kd">let</span><span class="w"> </span><span class="n">kmer</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">unitigs</span><span class="p">.</span><span class="n">raw_kmer</span><span class="p">(</span><span class="n">chunk_id</span><span class="p">,</span><span class="w"> </span><span class="n">rank</span><span class="p">);</span><span class="w"> </span><span class="c1">// 2-bit packed slice → left-aligned u64</span>
</code></pre></div>
<p>Two memory accesses: one 4-byte read from <code>evidence.bin</code>, one packed-bit extraction from <code>unitigs.bin</code> via the mmap. The retrieved sequence is already canonical (only canonical kmers are inserted into the De Bruijn graph).</p>
<hr />
<h2 id="field-widths-and-capacity">Field widths and capacity</h2>
<table>
<thead>
<tr>
<th>field</th>
<th>bits</th>
<th>range</th>
<th>capacity check (<em>B. nana</em>, 256 partitions)</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>seql k</code></td>
<td>8</td>
<td>0255</td>
<td>max <code>n_kmers</code> per chunk = 256 = <code>MAX_KMERS_PER_CHUNK</code></td>
</tr>
<tr>
<td><code>rank</code></td>
<td>7</td>
<td>0127</td>
<td>observed max ~46 kmers/chunk; structural max km+1 = 21</td>
</tr>
<tr>
<td><code>chunk_id</code></td>
<td>25</td>
<td>033 554 431</td>
<td>avg U ≈ 275 k chunks/partition</td>
</tr>
</tbody>
</table>
<p>The rank field is 7 bits (max 127) even though chunks can contain up to 256 k-mers, because rank counts only canonical kmers within the chunk, and the canonical kmer count is at most half the total.</p>
<hr />
<h2 id="evidence-bit-cost">Evidence bit-cost</h2>
<p>Strategy B (chunk_id + rank) is the implemented strategy. For <em>B. nana</em> (k=31, 256 partitions, P ≈ 10.4 M unique kmers/partition, U ≈ 275 k chunks/partition, m_u ≈ 37.9 kmers/chunk):</p>
<table>
<thead>
<tr>
<th>field</th>
<th>theoretical cost</th>
<th>value</th>
</tr>
</thead>
<tbody>
<tr>
<td>chunk_id</td>
<td>⌈log₂ U⌉</td>
<td>19 bits</td>
</tr>
<tr>
<td>rank</td>
<td>⌈log₂ m_u⌉ (≈ fixed)</td>
<td>6 bits</td>
</tr>
<tr>
<td><strong>stored</strong></td>
<td>aligned u32</td>
<td><strong>32 bits/slot</strong></td>
</tr>
</tbody>
</table>
<p>The u32 layout is chosen for alignment and simplicity; no bit-addressing arithmetic is needed.</p>
<p>Comparison with strategy A (global nucleotide offset): <code>⌈log₂(P · (1 + (k1)/m_u))⌉ = 25 bits</code>. Strategy A is theoretically 2 bits cheaper; strategy B's advantage is <strong>locality</strong> (decoding touches one chunk's cache lines) and a bounded, constant-width rank field independent of partition size.</p>
<hr />
<h2 id="unitig-decomposition-non-determinism">Unitig decomposition non-determinism</h2>
<p>The unitig extraction from <code>GraphDeBruijn</code> is <strong>not deterministic</strong>: two runs on identical input can produce different unitig counts and sequences while covering exactly the same canonical kmer set.</p>
<p>The hash map (<code>hashbrown::HashMap</code> with <code>Xxh3Builder</code>) has run-dependent iteration order. The <code>start_iter</code> first pass emits every node where <code>can_extend_left</code> is false — this includes true dead-ends and branch points (nodes with ≥2 left neighbours). When a branch point is encountered before its upstream neighbours, it claims the downstream chain and those upstream neighbours later produce length-k degenerate unitigs. When upstream neighbours appear first, they extend through the branch point.</p>
<p><strong>Example</strong> — fork topology (k = 31):</p>
<div class="highlight"><pre><span></span><code>A → B ← C
D
</code></pre></div>
<p>B has two left neighbours, so <code>can_extend_left = false</code>. Two valid tilings:</p>
<table>
<thead>
<tr>
<th>iteration order</th>
<th>unitigs</th>
<th>count</th>
</tr>
</thead>
<tbody>
<tr>
<td>A first</td>
<td>ABD, C</td>
<td>2</td>
</tr>
<tr>
<td>B first</td>
<td>BD, A, C</td>
<td>3</td>
</tr>
</tbody>
</table>
<p>Both cover the same 4 canonical kmers. Pure cycles are unaffected: all cycle nodes have both extensions present, so none are emitted in the first pass; each cycle produces exactly one unitig regardless of entry point (only the cut point varies).</p>
<p>This non-determinism is benign for MPHF construction: the MPHF is built from the kmer set, which is identical across tilings.</p>
<hr />
<h2 id="partition-size-tradeoff">Partition-size tradeoff</h2>
<p>Measured on <em>B. nana</em> (k=31, m=11), summing across all partitions:</p>
<table>
<thead>
<tr>
<th>N partitions</th>
<th>m_u</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>41.89</td>
</tr>
<tr>
<td>16</td>
<td>38.19</td>
</tr>
<tr>
<td>256</td>
<td>37.90</td>
</tr>
<tr>
<td>1 024</td>
<td>37.89</td>
</tr>
</tbody>
</table>
<p><code>m_u</code> is set by De Bruijn graph topology (heterozygosity, repeats, sequencing errors), not partition count. The variation from 1 to 1024 partitions is under 10%; within 161024 it is under 1%. Unitigs provide ~3.1× nucleotide compaction over super-kmers at 256 partitions.</p>
<p>Evidence cost decreases by 1 bit/kmer with each doubling of partition count (via <code>log₂ U = log₂(P/m_u)</code>). The sequence storage term <code>2 · (1 + (k1)/m_u) ≈ 3.6 bits/kmer</code> is approximately constant.</p>
<hr />
<h2 id="alternative-fingerprint-evidence">Alternative: fingerprint evidence</h2>
<p><code>evidence.bin</code> can be replaced by <code>fingerprint.bin</code> at index build time (<code>--approx</code>) or after the fact (<code>reindex --approx</code>). The fingerprint stores b bits per MPHF slot (the low b bits of <code>kmer.seq_hash()</code>); verification becomes a single bitfield comparison instead of a unitig dereference. False-positive rate per k-mer query: 1/2^b. With the Findere z parameter, z consecutive k-mers must all match, reducing the effective window FP rate to 1/2^(b·z) while skipping z1 of every z k-mers. No <code>.idx</code> file is written or read in approx mode.</p>
<p>See <a href="../evidence_elimination/">Approximate evidence (Findere fingerprint)</a> for the full design and CLI parameters.</p>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>