f36b095ce2
Formalize the two-phase MPHF indexing architecture and update Phase 6 to use `evidence.bin` for direct kmer extraction. Simplify the evidence and unitig storage layouts to flat packed formats enabling O(1) random access. Introduce aggregation traits (`ColumnWeights`, `CountPartials`, `BitPartials`) to support additive distance metric decomposition across partitions. Narrow the documented scope from metagenomic to individual genome datasets, and replace speculative open questions with concrete implementation specifications.
1708 lines
54 KiB
HTML
1708 lines
54 KiB
HTML
|
||
<!doctype html>
|
||
<html lang="en" class="no-js">
|
||
<head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
|
||
|
||
|
||
|
||
<link rel="prev" href="../obilayeredmap/">
|
||
|
||
|
||
<link rel="next" href="../persistent_bit_vec/">
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="icon" href="../../assets/images/favicon.png">
|
||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||
|
||
|
||
|
||
<title>PersistentCompactIntVec - obikmer</title>
|
||
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||
|
||
|
||
|
||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
|
||
<body dir="ltr">
|
||
|
||
|
||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||
<label class="md-overlay" for="__drawer"></label>
|
||
<div data-md-component="skip">
|
||
|
||
|
||
<a href="#persistentcompactintvec-and-persistentcompactintmatrix" class="md-skip">
|
||
Skip to content
|
||
</a>
|
||
|
||
</div>
|
||
<div data-md-component="announce">
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<header class="md-header md-header--shadow" data-md-component="header">
|
||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
<label class="md-header__button md-icon" for="__drawer">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||
</label>
|
||
<div class="md-header__title" data-md-component="header-title">
|
||
<div class="md-header__ellipsis">
|
||
<div class="md-header__topic">
|
||
<span class="md-ellipsis">
|
||
obikmer
|
||
</span>
|
||
</div>
|
||
<div class="md-header__topic" data-md-component="header-topic">
|
||
<span class="md-ellipsis">
|
||
|
||
PersistentCompactIntVec
|
||
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||
|
||
|
||
|
||
|
||
</nav>
|
||
|
||
</header>
|
||
|
||
<div class="md-container" data-md-component="container">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<main class="md-main" data-md-component="main">
|
||
<div class="md-main__inner md-grid">
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||
<label class="md-nav__title" for="__drawer">
|
||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
obikmer
|
||
</label>
|
||
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../.." class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Home
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Theory
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_2">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Theory
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../kmers/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmers and super-kmers
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/encoding/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
DNA encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/entropy/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Entropy filter
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Minimizer selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/indexing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Partitioning architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
|
||
<label class="md-nav__title" for="__nav_3">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../superkmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
SuperKmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../kmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../chunkreader/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Chunk reader
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../pipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obipipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obipipeline library
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../storage/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
On-disk storage
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../mphf/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
MPHF selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../unitig_evidence/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Unitig evidence encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../obilayeredmap/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obilayeredmap crate
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--active">
|
||
|
||
<input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__link md-nav__link--active" for="__toc">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentCompactIntVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<a href="./" class="md-nav__link md-nav__link--active">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentCompactIntVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#purpose" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Purpose
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#persistentcompactintvec-single-column-file" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
PersistentCompactIntVec — single-column file
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="PersistentCompactIntVec — single-column file">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#design" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Design
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#file-format" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
File format
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#lifecycle" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Lifecycle
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Lifecycle">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#builder-persistentcompactintvecbuilder" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Builder (PersistentCompactIntVecBuilder)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#reader-persistentcompactintvec" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Reader (PersistentCompactIntVec)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#step-computation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Step computation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#complexity" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Complexity
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#persistentcompactintmatrix-column-major-directory" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
PersistentCompactIntMatrix — column-major directory
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="PersistentCompactIntMatrix — column-major directory">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#design_1" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Design
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#builder-persistentcompactintmatrixbuilder" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Builder (PersistentCompactIntMatrixBuilder)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#reader-persistentcompactintmatrix" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Reader (PersistentCompactIntMatrix)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#layerdata-implementation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
LayerData implementation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#aggregation-traits-obicompactvectraits" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Aggregation traits — obicompactvec::traits
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Aggregation traits — obicompactvec::traits">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#columnweights" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
ColumnWeights
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#countpartials" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
CountPartials
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../persistent_bit_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentBitVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_4">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/sequences/invariant/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Sequences
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../architecture/index_architecture/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer index
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#purpose" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Purpose
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#persistentcompactintvec-single-column-file" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
PersistentCompactIntVec — single-column file
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="PersistentCompactIntVec — single-column file">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#design" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Design
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#file-format" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
File format
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#lifecycle" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Lifecycle
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Lifecycle">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#builder-persistentcompactintvecbuilder" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Builder (PersistentCompactIntVecBuilder)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#reader-persistentcompactintvec" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Reader (PersistentCompactIntVec)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#step-computation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Step computation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#complexity" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Complexity
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#persistentcompactintmatrix-column-major-directory" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
PersistentCompactIntMatrix — column-major directory
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="PersistentCompactIntMatrix — column-major directory">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#design_1" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Design
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#builder-persistentcompactintmatrixbuilder" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Builder (PersistentCompactIntMatrixBuilder)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#reader-persistentcompactintmatrix" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Reader (PersistentCompactIntMatrix)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#layerdata-implementation" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
LayerData implementation
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#aggregation-traits-obicompactvectraits" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Aggregation traits — obicompactvec::traits
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Aggregation traits — obicompactvec::traits">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#columnweights" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
ColumnWeights
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#countpartials" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
CountPartials
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-content" data-md-component="content">
|
||
|
||
<article class="md-content__inner md-typeset">
|
||
|
||
|
||
|
||
|
||
|
||
<h1 id="persistentcompactintvec-and-persistentcompactintmatrix">PersistentCompactIntVec and PersistentCompactIntMatrix</h1>
|
||
<h2 id="purpose">Purpose</h2>
|
||
<p><code>PersistentCompactIntVec</code> stores a dense array of non-negative integers indexed by MPHF slot where the vast majority of values are small (0–254) and large values are rare. It is designed for mmap-compatible random and sequential access with minimal memory footprint and optimal cache behaviour.</p>
|
||
<p>Motivation from observed count distributions in genomics data: 99.9% of k-mer counts fit in a u8; overflow (count ≥ 255) affects ~0.07% of distinct k-mers but can reach values above 10⁶ (chloroplast, ribosomal repeats).</p>
|
||
<p><code>PersistentCompactIntMatrix</code> wraps multiple <code>PersistentCompactIntVec</code> columns in a directory, exposing a column-major matrix with a row-access API. A vector is a matrix with 1 column.</p>
|
||
<hr />
|
||
<h2 id="persistentcompactintvec-single-column-file">PersistentCompactIntVec — single-column file</h2>
|
||
<h3 id="design">Design</h3>
|
||
<p>Two-tier structure:</p>
|
||
<ol>
|
||
<li><strong>Primary array</strong> — <code>[u8; n]</code>, stored at offset 40 in the PCIV file and mmap'd. Values 0–254 are stored directly. Value <strong>255 is a sentinel</strong> meaning "look in overflow".</li>
|
||
<li><strong>Overflow section</strong> — sorted list of <code>(slot: u64, value: u32)</code> pairs for all slots where the true value ≥ 255, with a <strong>sparse L1-fitting index</strong> for fast lookup.</li>
|
||
</ol>
|
||
<div class="highlight"><pre><span></span><code>primary[slot] &lt; 255  → return primary[slot]
|
||
primary[slot] == 255 → binary search in overflow
|
||
</code></pre></div>
|
||
<h3 id="file-format">File format</h3>
|
||
<p>Single <code>.pciv</code> file. Write order: header placeholder → primary → overflow + index → header overwrite at offset 0.</p>
|
||
<div class="highlight"><pre><span></span><code>offset 0:
|
||
magic: [u8; 4] = b"PCIV"
|
||
_pad: [u8; 4] = 0
|
||
n: u64 number of slots
|
||
n_overflow: u64 number of overflow entries
|
||
n_index: u64 number of sparse index entries
|
||
step: u64 sparse index step (0 = no index)
|
||
|
||
offset 40:
|
||
primary: [u8; n] one byte per slot, 255 = overflow sentinel
|
||
|
||
offset 40 + n:
|
||
data: [(slot: u64, value: u32); n_overflow] 12 bytes each, sorted by slot
|
||
|
||
offset 40 + n + n_overflow × 12:
|
||
index: [(slot: u64, pos: u64); n_index] 16 bytes each, sparse index
|
||
</code></pre></div>
|
||
<p>The index entries point into <code>data</code>: <code>index[i] = (slot of data[i×step], i×step)</code>.</p>
|
||
<p>All integer fields are little-endian. Slot indices are stored as <code>u64</code> in the file; they are <code>usize</code> in Rust code.</p>
|
||
<h3 id="lifecycle">Lifecycle</h3>
|
||
<h4 id="builder-persistentcompactintvecbuilder">Builder (<code>PersistentCompactIntVecBuilder</code>)</h4>
|
||
<p>Used during construction. The primary section is <strong>mmap'd immediately</strong> at construction time (both for <code>new</code> and <code>build_from</code>), so the file exists and is addressable from the start. The overflow is held in a <code>HashMap&lt;usize, u32&gt;</code> in RAM.</p>
|
||
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentCompactIntVecBuilder</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="n">path</span><span class="p">:</span><span class="w"> </span><span class="nc">PathBuf</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">mmap</span><span class="p">:</span><span class="w"> </span><span class="nc">MmapMut</span><span class="p">,</span><span class="w"> </span><span class="c1">// primary section live in the file from the start</span>
|
||
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="w">    </span><span class="n">overflow</span><span class="p">:</span><span class="w"> </span><span class="nc">HashMap</span><span class="o">&lt;</span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="kt">u32</span><span class="o">&gt;</span><span class="p">,</span><span class="w"> </span><span class="c1">// values ≥ 255</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><strong><code>new(n: usize, path: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
|
||
<p>Creates the file, pre-allocates <code>HEADER_SIZE + n</code> zero bytes, mmaps it. The primary is zero-initialised (all slots = 0). Returns immediately ready for <code>set</code> / <code>get</code>.</p>
|
||
<p><strong><code>build_from(source: &amp;PersistentCompactIntVec, path: &amp;Path) -&gt; io::Result&lt;Self&gt;</code></strong></p>
|
||
<p>Copies the source PCIV file to <code>path</code> (OS-level copy — no per-slot iteration), mmaps the copy, then loads the overflow section into a <code>HashMap</code>. Initialisation cost: O(file copy) + O(n_overflow), not O(n).</p>
|
||
<p>At <code>close()</code>, the primary section is <strong>not rewritten</strong>: it is already in the file via mmap. Only the overflow data, the sparse index, and the header are updated.</p>
|
||
<p><strong><code>set(slot: usize, value: u32)</code> / <code>get(slot: usize) -> u32</code></strong></p>
|
||
<p>Direct mmap byte access for the primary; HashMap for the overflow. Both O(1). Mutations can move a slot between tiers freely (downward mutation removes the HashMap entry; upward mutation adds it).</p>
|
||
<p><strong>Element-wise operations — <code>min</code>, <code>max</code>, <code>add</code>, <code>diff</code></strong></p>
|
||
<p>Each takes a <code>&PersistentCompactIntVec</code> of equal length and updates <code>self</code> in place via <code>set</code>:</p>
|
||
<div class="highlight"><pre><span></span><code><span class="n">builder</span><span class="p">.</span><span class="n">min</span><span class="p">(</span><span class="o">&</span><span class="n">other</span><span class="p">);</span><span class="w"> </span><span class="c1">// self[i] = min(self[i], other[i])</span>
|
||
<span class="n">builder</span><span class="p">.</span><span class="n">max</span><span class="p">(</span><span class="o">&</span><span class="n">other</span><span class="p">);</span><span class="w"> </span><span class="c1">// self[i] = max(self[i], other[i])</span>
|
||
<span class="n">builder</span><span class="p">.</span><span class="n">add</span><span class="p">(</span><span class="o">&</span><span class="n">other</span><span class="p">);</span><span class="w"> </span><span class="c1">// self[i] = self[i].checked_add(other[i]) (panics on u32 overflow)</span>
|
||
<span class="n">builder</span><span class="p">.</span><span class="n">diff</span><span class="p">(</span><span class="o">&</span><span class="n">other</span><span class="p">);</span><span class="w"> </span><span class="c1">// self[i] = self[i].saturating_sub(other[i])</span>
|
||
</code></pre></div>
|
||
<p>All iterate <code>other</code> with <code>other.iter()</code> (merge-scan, O(n_other)).</p>
|
||
<p><strong><code>close(self) -&gt; io::Result&lt;()&gt;</code></strong></p>
|
||
<ol>
|
||
<li>Flush and drop the mmap (primary changes are now on disk).</li>
|
||
<li>Sort the overflow HashMap into <code>Vec&lt;(usize, u32)&gt;</code>.</li>
|
||
<li>Truncate the file to <code>HEADER_SIZE + n</code> (removes old data+index if <code>build_from</code> was used).</li>
|
||
<li>Append sorted overflow data, then sparse index.</li>
|
||
<li>Seek to offset 0, overwrite the header with final values.</li>
|
||
</ol>
|
||
<h4 id="reader-persistentcompactintvec">Reader (<code>PersistentCompactIntVec</code>)</h4>
|
||
<p>Used at query time. The whole file is mmap'd; only the sparse index is copied into a <code>Vec</code> at open time (≤ 32 KB, L1-resident).</p>
|
||
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentCompactIntVec</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="n">mmap</span><span class="p">:</span><span class="w"> </span><span class="nc">Mmap</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">n_overflow</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">step</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="w">    </span><span class="n">index</span><span class="p">:</span><span class="w">          </span><span class="nb">Vec</span><span class="o">&lt;</span><span class="p">(</span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="o">&gt;</span><span class="p">,</span><span class="w"> </span><span class="c1">// (slot, pos) — L1-resident</span>
|
||
<span class="w"> </span><span class="n">primary_offset</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="c1">// = 40 (HEADER_SIZE)</span>
|
||
<span class="w"> </span><span class="n">data_offset</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span><span class="w"> </span><span class="c1">// = 40 + n</span>
|
||
<span class="w"> </span><span class="n">path</span><span class="p">:</span><span class="w"> </span><span class="nc">PathBuf</span><span class="p">,</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><strong><code>open(path: &Path) -> io::Result<Self></code></strong></p>
|
||
<p>Mmaps the file, parses the 40-byte header, copies the sparse index entries into a <code>Vec</code>. The primary and data sections stay mmap'd.</p>
|
||
<p><strong><code>get(slot: usize) -> u32</code> — random access</strong></p>
|
||
<div class="highlight"><pre><span></span><code>primary[slot] < 255 → return it directly
|
||
|
||
step == 0:
|
||
binary_search(data[0..n_overflow], slot)
|
||
|
||
step > 0:
|
||
i = upper_bound(index[..].slot, slot) − 1 // in L1-resident Vec
|
||
binary_search(data[index[i].pos .. index[i+1].pos], slot)
|
||
</code></pre></div>
|
||
<p><strong><code>iter() -> Iter<'_></code> — sequential scan, O(n)</strong></p>
|
||
<p>Merge-scan: reads primary bytes in order; on sentinel 255, advances a sequential pointer into the sorted data section rather than doing a binary search. This gives O(n + n_overflow) — i.e. O(n), since n_overflow ≤ n — with no random access into the data section.</p>
|
||
<p><code>Iter</code> implements <code>ExactSizeIterator</code>. <code>&PersistentCompactIntVec</code> implements <code>IntoIterator</code>.</p>
|
||
<p><strong>Aggregate</strong></p>
|
||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">sum</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="kt">u64</span><span class="w"> </span><span class="c1">// Σ self[i] as u64, via iter()</span>
|
||
</code></pre></div>
|
||
<p><strong>Distance methods</strong></p>
|
||
<p>All take <code>&other</code> of equal length, iterate both with <code>zip(self.iter(), other.iter())</code>, and return <code>f64</code>.</p>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>Method</th>
|
||
<th>Formula</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td><code>bray_dist</code></td>
|
||
<td><code>1 − 2·Σmin(aᵢ,bᵢ) / (Σaᵢ + Σbᵢ)</code></td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>relfreq_bray_dist</code></td>
|
||
<td>Bray-Curtis on relative frequencies: <code>1 − Σmin(pᵢ,qᵢ)</code> where <code>pᵢ = aᵢ/Σa</code> and <code>qᵢ = bᵢ/Σb</code></td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>euclidean_dist</code></td>
|
||
<td><code>√Σ(aᵢ − bᵢ)²</code></td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>relfreq_euclidean_dist</code></td>
|
||
<td>Euclidean on relative frequencies</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>hellinger_euclidean_dist</code></td>
|
||
<td><code>√Σ(√pᵢ − √qᵢ)²</code> — Euclidean on sqrt(relfreq)</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>hellinger_dist</code></td>
|
||
<td><code>hellinger_euclidean_dist / √2</code> — standard Hellinger distance ∈ [0, 1]</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>threshold_jaccard_dist(&other, threshold: u32)</code></td>
|
||
<td><code>1 − |A∩B| / |A∪B|</code> where a slot is present iff its count ≥ threshold</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>jaccard_dist</code></td>
|
||
<td><code>threshold_jaccard_dist(&other, 1)</code></td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p>Edge cases (both vectors all-zero, or union empty for Jaccard): distance = 0.0.</p>
|
||
<h3 id="step-computation">Step computation</h3>
|
||
<p>Chosen at <code>close()</code> once <code>n_overflow</code> is known:</p>
|
||
<div class="highlight"><pre><span></span><code>L1_INDEX_ENTRIES = 2048
|
||
|
||
step = 0 if n_overflow ≤ 2048
|
||
step = ⌈n_overflow / 2048⌉ otherwise
|
||
</code></pre></div>
|
||
<h3 id="complexity">Complexity</h3>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>Operation</th>
|
||
<th>Time</th>
|
||
<th>Notes</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td><code>set</code> / <code>get</code> (builder)</td>
|
||
<td>O(1)</td>
|
||
<td>mmap byte + HashMap</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>get</code> (reader, no overflow)</td>
|
||
<td>O(1)</td>
|
||
<td>single mmap byte</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>get</code> (reader, with index)</td>
|
||
<td>O(log step)</td>
|
||
<td>index search (≤ 2048 entries) then block search; ≤ 2 memory regions</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>get</code> (reader, no index)</td>
|
||
<td>O(log n_overflow)</td>
|
||
<td>data fits in a few cache lines</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>iter()</code> full scan</td>
|
||
<td>O(n + n_overflow)</td>
|
||
<td>merge-scan, no binary search</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>sum</code>, distances</td>
|
||
<td>O(n)</td>
|
||
<td>via <code>iter()</code> / <code>zip(iter(), iter())</code></td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>min</code> / <code>max</code> / <code>add</code> / <code>diff</code></td>
|
||
<td>O(n)</td>
|
||
<td>via <code>other.iter()</code> + builder <code>set</code></td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>close</code></td>
|
||
<td>O(n_overflow log n_overflow)</td>
|
||
<td>sort + sequential write</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>open</code></td>
|
||
<td>O(n_index)</td>
|
||
<td>index copy into Vec</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>build_from</code></td>
|
||
<td>O(file_size) + O(n_overflow)</td>
|
||
<td>OS copy + HashMap load</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<hr />
|
||
<h2 id="persistentcompactintmatrix-column-major-directory">PersistentCompactIntMatrix — column-major directory</h2>
|
||
<h3 id="design_1">Design</h3>
|
||
<p>A directory containing <code>meta.json</code> and N column files <code>col_000000.pciv</code>, <code>col_000001.pciv</code>, …, each a <code>PersistentCompactIntVec</code>. This is the type used by <code>LayerData</code> — a single-column matrix is functionally equivalent to a vector but shares the same interface as multi-column matrices.</p>
|
||
<div class="highlight"><pre><span></span><code>counts/
|
||
meta.json {"n": <n_slots>, "n_cols": <N>}
|
||
col_000000.pciv
|
||
col_000001.pciv
|
||
...
|
||
</code></pre></div>
|
||
<h3 id="builder-persistentcompactintmatrixbuilder">Builder (<code>PersistentCompactIntMatrixBuilder</code>)</h3>
|
||
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentCompactIntMatrixBuilder</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="n">dir</span><span class="p">:</span><span class="w"> </span><span class="nc">PathBuf</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">n_cols</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><strong><code>new(n: usize, dir: &Path) -> io::Result<Self></code></strong></p>
|
||
<p>Creates the directory (including parents). Does not write <code>meta.json</code> yet.</p>
|
||
<p><strong><code>add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder></code></strong></p>
|
||
<p>Creates <code>col_NNNNNN.pciv</code> for the next column and returns its builder. The caller fills the column and calls <code>builder.close()</code> before calling <code>add_col</code> again.</p>
|
||
<p><strong><code>close(self) -> io::Result<()></code></strong></p>
|
||
<p>Writes <code>meta.json</code> with the final <code>n</code> and <code>n_cols</code>. Must be called after all column builders are closed.</p>
|
||
<h3 id="reader-persistentcompactintmatrix">Reader (<code>PersistentCompactIntMatrix</code>)</h3>
|
||
<div class="highlight"><pre><span></span><code><span class="k">struct</span><span class="w"> </span><span class="nc">PersistentCompactIntMatrix</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="n">cols</span><span class="p">:</span><span class="w"> </span><span class="nb">Vec</span><span class="o"><</span><span class="n">PersistentCompactIntVec</span><span class="o">></span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">n</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><strong><code>open(dir: &Path) -> io::Result<Self></code></strong></p>
|
||
<p>Reads <code>meta.json</code>, opens all <code>col_NNNNNN.pciv</code> files.</p>
|
||
<p><strong><code>row(slot: usize) -> Box<[u32]></code></strong></p>
|
||
<p>Returns the full row: <code>[col_0[slot], col_1[slot], …, col_{N-1}[slot]]</code>. One mmap access per column. O(N).</p>
|
||
<p><strong><code>col(c: usize) -> &PersistentCompactIntVec</code></strong></p>
|
||
<p>Direct access to a single column for column-oriented operations (distance computations, iteration).</p>
|
||
<h3 id="layerdata-implementation">LayerData implementation</h3>
|
||
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">LayerData</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">PersistentCompactIntMatrix</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="k">type</span><span class="w"> </span><span class="nc">Item</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="nb">Box</span><span class="o"><</span><span class="p">[</span><span class="kt">u32</span><span class="p">]</span><span class="o">></span><span class="p">;</span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">open</span><span class="p">(</span><span class="n">layer_dir</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Path</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">OLMResult</span><span class="o"><</span><span class="bp">Self</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="cm">/* opens layer_dir/counts/ */</span><span class="w"> </span><span class="p">}</span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">read</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">slot</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nb">Box</span><span class="o"><</span><span class="p">[</span><span class="kt">u32</span><span class="p">]</span><span class="o">></span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="bp">self</span><span class="p">.</span><span class="n">row</span><span class="p">(</span><span class="n">slot</span><span class="p">)</span><span class="w"> </span><span class="p">}</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<hr />
|
||
<h2 id="aggregation-traits-obicompactvectraits">Aggregation traits — <code>obicompactvec::traits</code></h2>
|
||
<p><code>PersistentCompactIntMatrix</code> implements two aggregation traits used by <code>LayeredStore<S></code> for cross-layer and cross-partition distance computations.</p>
|
||
<h3 id="columnweights">ColumnWeights</h3>
|
||
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">ColumnWeights</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">PersistentCompactIntMatrix</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">col_weights</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="w"> </span><span class="c1">// = self.sum()</span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><code>col_weights()[c]</code> = sum of all values in column <code>c</code> across all slots.</p>
|
||
<h3 id="countpartials">CountPartials</h3>
|
||
<div class="highlight"><pre><span></span><code><span class="k">impl</span><span class="w"> </span><span class="n">CountPartials</span><span class="w"> </span><span class="k">for</span><span class="w"> </span><span class="n">PersistentCompactIntMatrix</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="c1">// Self-contained partials (additive across layers, no external parameter)</span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_threshold_jaccard</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">threshold</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="p">(</span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">,</span><span class="w"> </span><span class="n">Array2</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span>
|
||
|
||
<span class="w"> </span><span class="c1">// Normalised partials (require global col_weights across all layers/partitions)</span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_bray</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_relfreq_euclidean</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">partial_hellinger</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">global</span><span class="p">:</span><span class="w"> </span><span class="kp">&</span><span class="nc">Array1</span><span class="o"><</span><span class="kt">u64</span><span class="o">></span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
|
||
<span class="w"> </span><span class="c1">// Provided finalisations (default implementations on the trait)</span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">euclidean_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">threshold_jaccard_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">,</span><span class="w"> </span><span class="n">threshold</span><span class="p">:</span><span class="w"> </span><span class="kt">u32</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_bray_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">relfreq_euclidean_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="w"> </span><span class="k">fn</span><span class="w"> </span><span class="nf">hellinger_dist_matrix</span><span class="p">(</span><span class="o">&</span><span class="bp">self</span><span class="p">)</span><span class="w"> </span><span class="p">-></span><span class="w"> </span><span class="nc">Array2</span><span class="o"><</span><span class="kt">f64</span><span class="o">></span>
|
||
<span class="p">}</span>
|
||
</code></pre></div>
|
||
<p><strong>Self-contained partials</strong> are additively decomposable: summing <code>partial_bray()</code> across all <code>(partition, layer)</code> pairs and finalising gives the same result as computing on the combined data.</p>
|
||
<p><strong>Normalised partials</strong> require the global column weights (sum across all layers and all partitions). The <code>global</code> parameter must reflect the complete index, not a per-layer sum. The provided <code>relfreq_bray_dist_matrix()</code> etc. call <code>col_weights()</code> first (pass 1) then the normalised partial (pass 2); when called on a <code>LayeredStore<LayeredStore<…>></code> these two-pass calls cascade automatically through the blanket impls.</p>
|
||
<p><strong><code>partial_bray</code> returns <code>Array2<u64></code></strong> (sum_min only, not a tuple). The denominator is always reconstructible as <code>col_weights()[i] + col_weights()[j]</code>.</p>
|
||
<p><strong><code>partial_threshold_jaccard</code> returns <code>(inter, union)</code></strong> as a pair because, unlike the Bray-Curtis denominator, <code>union[i,j]</code> is not reconstructible from <code>col_weights()</code> — it depends on which slots reach the threshold in both columns simultaneously.</p>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
</div>
|
||
|
||
|
||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||
</div>
|
||
|
||
</main>
|
||
|
||
<footer class="md-footer">
|
||
|
||
<div class="md-footer-meta md-typeset">
|
||
<div class="md-footer-meta__inner md-grid">
|
||
<div class="md-copyright">
|
||
|
||
|
||
Made with
|
||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||
Material for MkDocs
|
||
</a>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
<div class="md-dialog" data-md-component="dialog">
|
||
<div class="md-dialog__inner md-typeset"></div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||
|
||
|
||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||
|
||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||
|
||
|
||
</body>
|
||
</html> |