Files
obikmer/doc/implementation/persistent_bit_vec/index.html
T

1764 lines
48 KiB
HTML
Raw Normal View History

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="prev" href="../persistent_compact_int_vec/">
<link rel="next" href="../merge/">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>PersistentBitVec - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#persistentbitvec-and-persistentbitmatrix" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
PersistentBitVec
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/minimizer/" class="md-nav__link">
<span class="md-ellipsis">
Minimizer selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../obipipeline/" class="md-nav__link">
<span class="md-ellipsis">
obipipeline library
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../unitig_evidence/" class="md-nav__link">
<span class="md-ellipsis">
Unitig evidence encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../evidence_elimination/" class="md-nav__link">
<span class="md-ellipsis">
Evidence elimination (discussion)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--active">
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
<label class="md-nav__link md-nav__link--active" for="__toc">
<span class="md-ellipsis">
PersistentBitVec
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<a href="./" class="md-nav__link md-nav__link--active">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#purpose" class="md-nav__link">
<span class="md-ellipsis">
Purpose
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#persistentbitvec-single-column-file" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec — single-column file
</span>
</a>
<nav class="md-nav" aria-label="PersistentBitVec — single-column file">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#file-format" class="md-nav__link">
<span class="md-ellipsis">
File format
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#lifecycle" class="md-nav__link">
<span class="md-ellipsis">
Lifecycle
</span>
</a>
<nav class="md-nav" aria-label="Lifecycle">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#builder-persistentbitvecbuilder" class="md-nav__link">
<span class="md-ellipsis">
Builder (PersistentBitVecBuilder)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reader-persistentbitvec" class="md-nav__link">
<span class="md-ellipsis">
Reader (PersistentBitVec)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#implementation-notes" class="md-nav__link">
<span class="md-ellipsis">
Implementation notes
</span>
</a>
<nav class="md-nav" aria-label="Implementation notes">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#u64-word-view" class="md-nav__link">
<span class="md-ellipsis">
u64 word view
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#padding-invariant" class="md-nav__link">
<span class="md-ellipsis">
Padding invariant
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#complexity" class="md-nav__link">
<span class="md-ellipsis">
Complexity
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#persistentbitmatrix-column-major-directory" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitMatrix — column-major directory
</span>
</a>
<nav class="md-nav" aria-label="PersistentBitMatrix — column-major directory">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#design" class="md-nav__link">
<span class="md-ellipsis">
Design
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#builder-persistentbitmatrixbuilder" class="md-nav__link">
<span class="md-ellipsis">
Builder (PersistentBitMatrixBuilder)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reader-persistentbitmatrix" class="md-nav__link">
<span class="md-ellipsis">
Reader (PersistentBitMatrix)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#layerdata-implementation" class="md-nav__link">
<span class="md-ellipsis">
LayerData implementation
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#aggregation-traits-obicompactvectraits" class="md-nav__link">
<span class="md-ellipsis">
Aggregation traits — obicompactvec::traits
</span>
</a>
<nav class="md-nav" aria-label="Aggregation traits — obicompactvec::traits">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#columnweights" class="md-nav__link">
<span class="md-ellipsis">
ColumnWeights
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#bitpartials" class="md-nav__link">
<span class="md-ellipsis">
BitPartials
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="../merge/" class="md-nav__link">
<span class="md-ellipsis">
Merge command
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../rebuild_filter/" class="md-nav__link">
<span class="md-ellipsis">
Kmer filtering (rebuild/dump/unitig)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../architecture/index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#purpose" class="md-nav__link">
<span class="md-ellipsis">
Purpose
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#persistentbitvec-single-column-file" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec — single-column file
</span>
</a>
<nav class="md-nav" aria-label="PersistentBitVec — single-column file">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#file-format" class="md-nav__link">
<span class="md-ellipsis">
File format
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#lifecycle" class="md-nav__link">
<span class="md-ellipsis">
Lifecycle
</span>
</a>
<nav class="md-nav" aria-label="Lifecycle">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#builder-persistentbitvecbuilder" class="md-nav__link">
<span class="md-ellipsis">
Builder (PersistentBitVecBuilder)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reader-persistentbitvec" class="md-nav__link">
<span class="md-ellipsis">
Reader (PersistentBitVec)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#implementation-notes" class="md-nav__link">
<span class="md-ellipsis">
Implementation notes
</span>
</a>
<nav class="md-nav" aria-label="Implementation notes">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#u64-word-view" class="md-nav__link">
<span class="md-ellipsis">
u64 word view
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#padding-invariant" class="md-nav__link">
<span class="md-ellipsis">
Padding invariant
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#complexity" class="md-nav__link">
<span class="md-ellipsis">
Complexity
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#persistentbitmatrix-column-major-directory" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitMatrix — column-major directory
</span>
</a>
<nav class="md-nav" aria-label="PersistentBitMatrix — column-major directory">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#design" class="md-nav__link">
<span class="md-ellipsis">
Design
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#builder-persistentbitmatrixbuilder" class="md-nav__link">
<span class="md-ellipsis">
Builder (PersistentBitMatrixBuilder)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#reader-persistentbitmatrix" class="md-nav__link">
<span class="md-ellipsis">
Reader (PersistentBitMatrix)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#layerdata-implementation" class="md-nav__link">
<span class="md-ellipsis">
LayerData implementation
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#aggregation-traits-obicompactvectraits" class="md-nav__link">
<span class="md-ellipsis">
Aggregation traits — obicompactvec::traits
</span>
</a>
<nav class="md-nav" aria-label="Aggregation traits — obicompactvec::traits">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#columnweights" class="md-nav__link">
<span class="md-ellipsis">
ColumnWeights
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#bitpartials" class="md-nav__link">
<span class="md-ellipsis">
BitPartials
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="persistentbitvec-and-persistentbitmatrix">PersistentBitVec and PersistentBitMatrix</h1>
<h2 id="purpose">Purpose</h2>
<p><code>PersistentBitVec</code> stores a dense bit vector (presence/absence per slot) backed by a single mmap'd file. It is the binary counterpart of <code>PersistentCompactIntVec</code> and shares the same lifecycle pattern (builder → close → reader). All bulk operations work on u64 words rather than bytes, giving 8× fewer iterations and enabling the compiler to emit POPCNT and SIMD instructions.</p>
<p>Typical use: converting k-mer count vectors to presence/absence vectors (with optional threshold), then computing set-theoretic distances (Jaccard) or edit distances (Hamming) between samples.</p>
<p><code>PersistentBitMatrix</code> wraps multiple <code>PersistentBitVec</code> columns in a directory, exposing a column-major binary matrix with row-access API. A single-column bit matrix is a vector at the API level.</p>
<hr />
<h2 id="persistentbitvec-single-column-file">PersistentBitVec — single-column file</h2>
<h3 id="file-format">File format</h3>
<p>Single <code>.pbiv</code> file.</p>
<div class="highlight"><pre><span></span><code>offset 0:
magic: [u8; 4] = b&quot;PBIV&quot;
_pad: [u8; 4] = 0 alignment padding
n: u64 number of bits
offset 16:
data: [u64; ⌈n/64⌉] bit words, LSB-first, zero-padded
</code></pre></div>
<p><strong>Header is 16 bytes</strong>, so data starts at an offset divisible by 8. Since <code>mmap</code> returns page-aligned memory (≥ 4096-byte aligned), the data slice is u64-aligned, enabling a zero-copy <code>&amp;[u8] → &amp;[u64]</code> reinterpretation.</p>
<p><strong>Bit layout</strong>: bit <code>i</code> is in <code>data[i &gt;&gt; 6]</code> at bit position <code>i &amp; 63</code> (LSB-first). Bits <code>[n, ⌈n/64⌉×64)</code> are <strong>always zero</strong> (padding). This invariant is maintained by all write operations and must be restored by <code>not()</code> after flipping.</p>
<p><strong>Total file size</strong>: <code>16 + ⌈n/64⌉ × 8</code> bytes.</p>
<h3 id="lifecycle">Lifecycle</h3>
<h4 id="builder-persistentbitvecbuilder">Builder (<code>PersistentBitVecBuilder</code>)</h4>
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentBitVecBuilder</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">mmap</span><span class="p">:</span><span class="w"> </span><span class="nc">MmapMut</span><span class="p">,</span>
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="p">}</span>
</code></pre></div>
<p>The file and mmap are created immediately at construction. The header is written once at <code>new()</code> or copied from the source at <code>build_from*()</code>. <code>close()</code> is a single flush — there is no tail to append, unlike <code>PersistentCompactIntVec</code>.</p>
<p><strong><code>new(n: usize, path: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
<p>Creates the file, writes the header, zero-extends to <code>16 + ⌈n/64⌉×8</code> bytes, mmaps immediately. All bits default to 0.</p>
<p><strong><code>build_from(source: &amp;PersistentBitVec, path: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
<p>OS-level file copy (no per-bit iteration), then mmap. Initialisation cost: O(file_size).</p>
<p><strong><code>build_from_counts(source: &amp;PersistentCompactIntVec, threshold: u32, path: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
<p>Creates a new file, iterates <code>source</code> with its merge-scan iterator (O(n)), and writes bits directly into u64 words:</p>
<div class="highlight"><pre><span></span><code><span class="c1">// bit i = 1 iff source[i] &gt;= threshold</span>
<span class="n">words</span><span class="p">[</span><span class="n">slot</span><span class="w"> </span><span class="o">&gt;&gt;</span><span class="w"> </span><span class="mi">6</span><span class="p">]</span><span class="w"> </span><span class="o">|=</span><span class="w"> </span><span class="mi">1</span><span class="k">u64</span><span class="w"> </span><span class="o">&lt;&lt;</span><span class="w"> </span><span class="p">(</span><span class="n">slot</span><span class="w"> </span><span class="o">&amp;</span><span class="w"> </span><span class="mi">63</span><span class="p">);</span>
</code></pre></div>
<p>Handles overflow values (≥ 255) transparently — the count iterator returns the true u32 value regardless.</p>
<p><strong><code>build_from_presence(source: &amp;PersistentCompactIntVec, path: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
<p>Shorthand for <code>build_from_counts(source, 1, path)</code>.</p>
<p><strong>Bit-level access</strong></p>
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">get</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">slot</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">bool</span>
<span class="nc">fn</span><span class="w"> </span><span class="n">set</span><span class="p">(</span><span class="o">&amp;</span><span class="k">mut</span><span class="w"> </span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">slot</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="n">value</span><span class="p">:</span><span class="w"> </span><span class="kt">bool</span><span class="p">)</span>
</code></pre></div>
<p>Byte-level mmap access: <code>mmap[16 + slot/8]</code>, bit <code>slot % 8</code>. O(1).</p>
<p><strong>Word-level bulk operations</strong></p>
<p>All operate on <code>⌈n/64⌉</code> u64 words. O(n/64) per call.</p>
<div class="highlight"><pre><span></span><code><span class="n">builder</span><span class="p">.</span><span class="n">and</span><span class="p">(</span><span class="o">&amp;</span><span class="n">other</span><span class="p">);</span><span class="w"> </span><span class="c1">// self[i] &amp;= other[i] for all i</span>
<span class="n">builder</span><span class="p">.</span><span class="n">or</span><span class="p">(</span><span class="o">&amp;</span><span class="n">other</span><span class="p">);</span><span class="w"> </span><span class="c1">// self[i] |= other[i]</span>
<span class="n">builder</span><span class="p">.</span><span class="n">xor</span><span class="p">(</span><span class="o">&amp;</span><span class="n">other</span><span class="p">);</span><span class="w"> </span><span class="c1">// self[i] ^= other[i]</span>
<span class="n">builder</span><span class="p">.</span><span class="n">not</span><span class="p">();</span><span class="w"> </span><span class="c1">// self[i] = !self[i], then re-zero padding bits</span>
</code></pre></div>
<p><code>and</code>/<code>or</code>/<code>xor</code> read <code>other</code>'s word slice directly (no allocation). <code>not()</code> flips all words then masks the last word's padding bits to restore the invariant.</p>
<p><strong><code>close(self) -&gt; io::Result&lt;()&gt;</code></strong></p>
<p>Flushes the mmap. The header was written at construction and is never rewritten. O(1) in Rust code.</p>
<h4 id="reader-persistentbitvec">Reader (<code>PersistentBitVec</code>)</h4>
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentBitVec</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">mmap</span><span class="p">:</span><span class="w"> </span><span class="nc">Mmap</span><span class="p">,</span>
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="w"> </span><span class="n">path</span><span class="p">:</span><span class="w"> </span><span class="nc">PathBuf</span><span class="p">,</span>
<span class="p">}</span>
</code></pre></div>
<p><strong><code>open(path: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
<p>Mmaps the file, validates magic, reads <code>n</code> from bytes <code>[8..16]</code>. O(1).</p>
<p><strong><code>get(slot: usize) -&gt; bool</code></strong></p>
<p>Byte-level read from <code>mmap[16 + slot/8]</code>. O(1).</p>
<p><strong><code>iter() -&gt; BitIter&lt;'_&gt;</code></strong></p>
<p>Sequential scan, byte by byte, yielding <code>bool</code> values in slot order. Implements <code>ExactSizeIterator</code>. O(n).</p>
<p><strong>Aggregates</strong></p>
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">count_ones</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u64</span><span class="w"> </span><span class="c1">// popcount over all words; padding bits are 0</span>
<span class="k">fn</span><span class="w"> </span><span class="nf">count_zeros</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="kt">u64</span><span class="w"> </span><span class="c1">// n - count_ones()</span>
</code></pre></div>
<p><code>count_ones</code> iterates <code>⌈n/64⌉</code> words and calls <code>u64::count_ones()</code> (maps to <code>POPCNT</code>). O(n/64).</p>
<p><strong>Distance methods</strong></p>
<p>Both operate word by word. O(n/64).</p>
<table>
<thead>
<tr>
<th>Method</th>
<th>Formula</th>
<th>Notes</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>jaccard_dist(&amp;other) -&gt; f64</code></td>
<td><code>1 − |A∩B| / |A∪B|</code></td>
<td><code>(a&amp;b).count_ones()</code>, <code>(a|b).count_ones()</code> per word</td>
</tr>
<tr>
<td><code>hamming_dist(&amp;other) -&gt; u64</code></td>
<td>number of differing bits</td>
<td><code>(a^b).count_ones()</code> per word</td>
</tr>
</tbody>
</table>
<p>Edge case (both all-zero → union = 0): <code>jaccard_dist</code> returns 0.0.</p>
<h3 id="implementation-notes">Implementation notes</h3>
<h4 id="u64-word-view">u64 word view</h4>
<p>The unsafe cast from <code>&amp;[u8]</code> to <code>&amp;[u64]</code> is sound because:</p>
<ol>
<li><code>mmap</code> base is page-aligned (≥ 4096-byte boundary).</li>
<li>Data offset = 16, and <code>16 % 8 == 0</code> → the data pointer is 8-byte aligned.</li>
<li>Data length = <code>⌈n/64⌉ × 8</code> bytes — always a multiple of 8.</li>
</ol>
<p>This gives zero-copy word-level access with no intermediate allocation.</p>
<h4 id="padding-invariant">Padding invariant</h4>
<p>Implementing <code>not()</code> without masking the last word would corrupt <code>count_ones()</code>, <code>hamming_dist()</code>, and <code>jaccard_dist()</code>, because the flipped padding bits would be counted as set. The mask applied to the last word after flipping is <code>(1u64 &lt;&lt; (n % 64)) - 1</code>; when <code>n % 64 == 0</code> there are no padding bits, so no masking is needed. The other bulk operations (<code>and</code>, <code>or</code>, <code>xor</code>) preserve the zero padding because both operands already have zero padding bits, and <code>0 &amp; 0</code>, <code>0 | 0</code>, and <code>0 ^ 0</code> are all 0.</p>
<h3 id="complexity">Complexity</h3>
<table>
<thead>
<tr>
<th>Operation</th>
<th>Time</th>
<th>Notes</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>new</code> / <code>open</code></td>
<td>O(1)</td>
<td>mmap setup + header parse</td>
</tr>
<tr>
<td><code>get</code> (builder or reader) / <code>set</code> (builder only)</td>
<td>O(1)</td>
<td>byte-level mmap</td>
</tr>
<tr>
<td><code>iter()</code></td>
<td>O(n)</td>
<td>byte-by-byte scan</td>
</tr>
<tr>
<td><code>count_ones</code> / <code>count_zeros</code></td>
<td>O(n/64)</td>
<td>POPCNT per u64 word</td>
</tr>
<tr>
<td><code>and</code> / <code>or</code> / <code>xor</code> / <code>not</code></td>
<td>O(n/64)</td>
<td>word-level bitwise ops</td>
</tr>
<tr>
<td><code>jaccard_dist</code> / <code>hamming_dist</code></td>
<td>O(n/64)</td>
<td>word AND/OR/XOR + POPCNT</td>
</tr>
<tr>
<td><code>build_from</code></td>
<td>O(file_size)</td>
<td>OS copy</td>
</tr>
<tr>
<td><code>build_from_counts</code> / <code>build_from_presence</code></td>
<td>O(n)</td>
<td>count iter + word fill</td>
</tr>
<tr>
<td><code>close</code></td>
<td>O(1)</td>
<td>flush only</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="persistentbitmatrix-column-major-directory">PersistentBitMatrix — column-major directory</h2>
<h3 id="design">Design</h3>
<p>A directory containing <code>meta.json</code> and N column files <code>col_000000.pbiv</code>, <code>col_000001.pbiv</code>, …, each a <code>PersistentBitVec</code>. Used for presence/absence matrices: one column per genome, one bit per MPHF slot.</p>
<div class="highlight"><pre><span></span><code>presence/
meta.json {&quot;n&quot;: &lt;n_slots&gt;, &quot;n_cols&quot;: &lt;G&gt;}
col_000000.pbiv genome 0
col_000001.pbiv genome 1
...
</code></pre></div>
<p>Column-major layout makes per-genome set operations (Jaccard, Hamming, AND/OR) cache-friendly — each genome is a contiguous file. Row access (which genomes contain a given kmer) requires one O(1) read per column.</p>
<h3 id="builder-persistentbitmatrixbuilder">Builder (<code>PersistentBitMatrixBuilder</code>)</h3>
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentBitMatrixBuilder</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">dir</span><span class="p">:</span><span class="w"> </span><span class="nc">PathBuf</span><span class="p">,</span>
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="w"> </span><span class="n">n_cols</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="p">}</span>
</code></pre></div>
<p><strong><code>new(n: usize, dir: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
<p>Creates the directory (including parents).</p>
<p><strong><code>add_col(&amp;mut self) -&gt; io::Result&lt;PersistentBitVecBuilder&gt;</code></strong></p>
<p>Creates <code>col_NNNNNN.pbiv</code> for the next column and returns its builder. The caller fills the column and calls <code>builder.close()</code> before calling <code>add_col</code> again.</p>
<p><strong><code>close(self) -&gt; io::Result&lt;()&gt;</code></strong></p>
<p>Writes <code>meta.json</code> with the final <code>n</code> and <code>n_cols</code>.</p>
<h3 id="reader-persistentbitmatrix">Reader (<code>PersistentBitMatrix</code>)</h3>
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentBitMatrix</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">cols</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o">&lt;</span><span class="n">PersistentBitVec</span><span class="o">&gt;</span><span class="p">,</span>
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="p">}</span>
</code></pre></div>
<p><strong><code>open(dir: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
<p>Reads <code>meta.json</code>, opens all <code>col_NNNNNN.pbiv</code> files.</p>
<p><strong><code>row(&amp;self, slot: usize) -&gt; Box&lt;[bool]&gt;</code></strong></p>
<p>Returns the presence vector: <code>[col_0[slot], col_1[slot], …, col_{G-1}[slot]]</code>, where <code>G</code> is the number of columns. One byte read per column, so the cost is O(G).</p>
<p><strong><code>col(&amp;self, c: usize) -&gt; &amp;PersistentBitVec</code></strong></p>
<p>Direct access to a single column for column-oriented operations.</p>
<h3 id="layerdata-implementation">LayerData implementation</h3>
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">LayerData</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">PersistentBitMatrix</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">type</span><span class="w"> </span><span class="nc">Item</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">Box</span><span class="o">&lt;</span><span class="p">[</span><span class="kt">bool</span><span class="p">]</span><span class="o">&gt;</span><span class="p">;</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">open</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="nc">Path</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">OLMResult</span><span class="o">&lt;</span><span class="bp">Self</span><span class="o">&gt;</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="cm">/* opens layer_dir/presence/ */</span><span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">read</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">slot</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nb">Box</span><span class="o">&lt;</span><span class="p">[</span><span class="kt">bool</span><span class="p">]</span><span class="o">&gt;</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">row</span><span class="p">(</span><span class="n">slot</span><span class="p">)</span><span class="w"> </span><span class="p">}</span>
<span class="p">}</span>
</code></pre></div>
<hr />
<h2 id="aggregation-traits-obicompactvectraits">Aggregation traits — <code>obicompactvec::traits</code></h2>
<p><code>PersistentBitMatrix</code> implements two aggregation traits used by <code>LayeredStore&lt;S&gt;</code> for cross-layer and cross-partition distance computations.</p>
<h3 id="columnweights">ColumnWeights</h3>
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">ColumnWeights</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">PersistentBitMatrix</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">col_weights</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">Array1</span><span class="o">&lt;</span><span class="kt">u64</span><span class="o">&gt;</span><span class="w"> </span><span class="c1">// = self.count_ones()</span>
<span class="p">}</span>
</code></pre></div>
<p><code>col_weights()[c]</code> = number of set bits in column <code>c</code> across all slots.</p>
<h3 id="bitpartials">BitPartials</h3>
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">BitPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">PersistentBitMatrix</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="c1">// Self-contained partials (additive across layers)</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_jaccard</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o">&lt;</span><span class="kt">u64</span><span class="o">&gt;</span><span class="p">,</span><span class="w"> </span><span class="n">Array2</span><span class="o">&lt;</span><span class="kt">u64</span><span class="o">&gt;</span><span class="p">)</span><span class="w"> </span><span class="c1">// (inter, union)</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_hamming</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">Array2</span><span class="o">&lt;</span><span class="kt">u64</span><span class="o">&gt;</span><span class="w"> </span><span class="c1">// differing bits</span>
<span class="w"> </span><span class="c1">// Provided finalisations</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">jaccard_dist_matrix</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">Array2</span><span class="o">&lt;</span><span class="kt">f64</span><span class="o">&gt;</span>
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">hamming_dist_matrix</span><span class="p">(</span><span class="o">&amp;</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nc">Array2</span><span class="o">&lt;</span><span class="kt">u64</span><span class="o">&gt;</span>
<span class="p">}</span>
</code></pre></div>
<p><code>partial_jaccard</code> returns <code>(inter, union)</code> as a pair because <code>union</code> is not reconstructible from per-column <code>count_ones()</code> — it depends on both columns simultaneously. Both components are additively decomposable across <code>(partition, layer)</code> pairs; the final <code>jaccard_dist_matrix()</code> is computed from their element-wise sums.</p>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>