<!-- doc/implementation/mphf/index.html -->


<!doctype html>
<html lang="en" class="no-js">
  <head>
    
      <meta charset="utf-8">
      <meta name="viewport" content="width=device-width,initial-scale=1">
      
      
        <link rel="prev" href="../storage/">
      
      
        <link rel="next" href="../unitig_evidence/">
      
      
      <link rel="icon" href="../../assets/images/favicon.png">
      <meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
    
    
        <title>MPHF selection - obikmer</title>
      
    
      <link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
      
      
        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
      
    
    <script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
    
      
  </head>
  
  
    <body dir="ltr">
  
    
    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
    <label class="md-overlay" for="__drawer"></label>
    <div data-md-component="skip">
      
        
        <a href="#mphf-selection-two-phase-indexing-architecture" class="md-skip">
          Skip to content
        </a>
      
    </div>
    <div data-md-component="announce">
      
    </div>
    
    
<header class="md-header md-header--shadow" data-md-component="header">
  <nav class="md-header__inner md-grid" aria-label="Header">
    <a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
      
  
  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>

    </a>
    <label class="md-header__button md-icon" for="__drawer">
      
      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
    </label>
    <div class="md-header__title" data-md-component="header-title">
      <div class="md-header__ellipsis">
        <div class="md-header__topic">
          <span class="md-ellipsis">
            obikmer
          </span>
        </div>
        <div class="md-header__topic" data-md-component="header-topic">
          <span class="md-ellipsis">
            
              MPHF selection
            
          </span>
        </div>
      </div>
    </div>
    
    
      <script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
    
    
  </nav>
  
</header>
    
    <div class="md-container" data-md-component="container">
      
      
      <main class="md-main" data-md-component="main">
        <div class="md-main__inner md-grid">
          
            
              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
                <div class="md-sidebar__scrollwrap">
                  <div class="md-sidebar__inner">
                    

<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
  <label class="md-nav__title" for="__drawer">
    <a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
      
  
  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>

    </a>
    obikmer
  </label>
  
  <ul class="md-nav__list" data-md-scrollfix>
    
      
    <li class="md-nav__item">
      <a href="../.." class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Home
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item md-nav__item--nested">
      
        
        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
        
          
          <label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
            
  
  <span class="md-ellipsis">
    
  
    Theory
  

  </span>
  
  
            <span class="md-nav__icon md-icon"></span>
          </label>
        
        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_2">
            <span class="md-nav__icon md-icon"></span>
            
  
    Theory
  

          </label>
          <ul class="md-nav__list" data-md-scrollfix>
            
              
    <li class="md-nav__item">
      <a href="../../kmers/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Kmers and super-kmers
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../theory/encoding/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    DNA encoding
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../theory/entropy/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Entropy filter
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../theory/minimizer/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Minimizer selection
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../theory/indexing/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Partitioning architecture
  

  </span>
  
  
      </a>
    </li>
  

          </ul>
        </nav>
      
    </li>
  

    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
      
        
        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" checked>
        
          
          <label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
            
  
  <span class="md-ellipsis">
    
  
    Implementation
  

  </span>
  
  
            <span class="md-nav__icon md-icon"></span>
          </label>
        
        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="true">
          <label class="md-nav__title" for="__nav_3">
            <span class="md-nav__icon md-icon"></span>
            
  
    Implementation
  

          </label>
          <ul class="md-nav__list" data-md-scrollfix>
            
              
    <li class="md-nav__item">
      <a href="../superkmer/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    SuperKmer
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../kmer/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Kmer
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../chunkreader/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Chunk reader
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../pipeline/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Construction pipeline
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../obipipeline/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    obipipeline library
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../storage/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    On-disk storage
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item md-nav__item--active">
      
      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
      
      
        <label class="md-nav__link md-nav__link--active" for="__toc">
          
  
  <span class="md-ellipsis">
    
  
    MPHF selection
  

  </span>
  
  
          <span class="md-nav__icon md-icon"></span>
        </label>
      
      <a href="./" class="md-nav__link md-nav__link--active">
        
  
  <span class="md-ellipsis">
    
  
    MPHF selection
  

  </span>
  
  
      </a>
      
        
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  
  
    <label class="md-nav__title" for="__toc">
      <span class="md-nav__icon md-icon"></span>
      Table of contents
    </label>
    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
      
        <li class="md-nav__item">
  <a href="#indexing-architecture" class="md-nav__link">
    <span class="md-ellipsis">
      
        Indexing architecture
      
    </span>
  </a>
  
    <nav class="md-nav" aria-label="Indexing architecture">
      <ul class="md-nav__list">
        
          <li class="md-nav__item">
  <a href="#superkmer-vs-kmer-counts" class="md-nav__link">
    <span class="md-ellipsis">
      
        Superkmer vs kmer counts
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#phase-1-provisional-index-and-spectrum" class="md-nav__link">
    <span class="md-ellipsis">
      
        Phase 1 — provisional index and spectrum
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#phase-2-definitive-index" class="md-nav__link">
    <span class="md-ellipsis">
      
        Phase 2 — definitive index
      
    </span>
  </a>
  
</li>
        
      </ul>
    </nav>
  
</li>
      
        <li class="md-nav__item">
  <a href="#candidates" class="md-nav__link">
    <span class="md-ellipsis">
      
        Candidates
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#mphf-choice-per-phase" class="md-nav__link">
    <span class="md-ellipsis">
      
        MPHF choice per phase
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#space-at-scale" class="md-nav__link">
    <span class="md-ellipsis">
      
        Space at scale
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#on-disk-and-mmap-considerations" class="md-nav__link">
    <span class="md-ellipsis">
      
        On-disk and mmap considerations
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#multilayer-index-architecture" class="md-nav__link">
    <span class="md-ellipsis">
      
        Multilayer index architecture
      
    </span>
  </a>
  
    <nav class="md-nav" aria-label="Multilayer index architecture">
      <ul class="md-nav__list">
        
          <li class="md-nav__item">
  <a href="#motivation" class="md-nav__link">
    <span class="md-ellipsis">
      
        Motivation
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#layer-structure" class="md-nav__link">
    <span class="md-ellipsis">
      
        Layer structure
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#membership-verification" class="md-nav__link">
    <span class="md-ellipsis">
      
        Membership verification
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#query-algorithm" class="md-nav__link">
    <span class="md-ellipsis">
      
        Query algorithm
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#layer-count-and-probe-cost" class="md-nav__link">
    <span class="md-ellipsis">
      
        Layer count and probe cost
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#merging-layers" class="md-nav__link">
    <span class="md-ellipsis">
      
        Merging layers
      
    </span>
  </a>
  
</li>
        
      </ul>
    </nav>
  
</li>
      
        <li class="md-nav__item">
  <a href="#open-questions" class="md-nav__link">
    <span class="md-ellipsis">
      
        Open questions
      
    </span>
  </a>
  
</li>
      
    </ul>
  
</nav>
      
    </li>
  

    <li class="md-nav__item">
      <a href="../unitig_evidence/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Unitig evidence encoding
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../obilayeredmap/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    obilayeredmap crate
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../persistent_compact_int_vec/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    PersistentCompactIntVec
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../persistent_bit_vec/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    PersistentBitVec
  

  </span>
  
  
      </a>
    </li>
  

          </ul>
        </nav>
      
    </li>
  

    <li class="md-nav__item md-nav__item--nested">
      
        
        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
        
          
          <label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
            
  
  <span class="md-ellipsis">
    
  
    Architecture
  

  </span>
  
  
            <span class="md-nav__icon md-icon"></span>
          </label>
        
        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_4">
            <span class="md-nav__icon md-icon"></span>
            
  
    Architecture
  

          </label>
          <ul class="md-nav__list" data-md-scrollfix>
            
              
    <li class="md-nav__item">
      <a href="../../architecture/sequences/invariant/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Sequences
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../architecture/index_architecture/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Kmer index
  

  </span>
  
  
      </a>
    </li>
  

          </ul>
        </nav>
      
    </li>
  

  </ul>
</nav>
                  </div>
                </div>
              </div>
            
            
              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
                <div class="md-sidebar__scrollwrap">
                  <div class="md-sidebar__inner">
                    

<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  
  
    <label class="md-nav__title" for="__toc">
      <span class="md-nav__icon md-icon"></span>
      Table of contents
    </label>
    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
      
        <li class="md-nav__item">
  <a href="#indexing-architecture" class="md-nav__link">
    <span class="md-ellipsis">
      
        Indexing architecture
      
    </span>
  </a>
  
    <nav class="md-nav" aria-label="Indexing architecture">
      <ul class="md-nav__list">
        
          <li class="md-nav__item">
  <a href="#superkmer-vs-kmer-counts" class="md-nav__link">
    <span class="md-ellipsis">
      
        Superkmer vs kmer counts
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#phase-1-provisional-index-and-spectrum" class="md-nav__link">
    <span class="md-ellipsis">
      
        Phase 1 — provisional index and spectrum
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#phase-2-definitive-index" class="md-nav__link">
    <span class="md-ellipsis">
      
        Phase 2 — definitive index
      
    </span>
  </a>
  
</li>
        
      </ul>
    </nav>
  
</li>
      
        <li class="md-nav__item">
  <a href="#candidates" class="md-nav__link">
    <span class="md-ellipsis">
      
        Candidates
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#mphf-choice-per-phase" class="md-nav__link">
    <span class="md-ellipsis">
      
        MPHF choice per phase
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#space-at-scale" class="md-nav__link">
    <span class="md-ellipsis">
      
        Space at scale
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#on-disk-and-mmap-considerations" class="md-nav__link">
    <span class="md-ellipsis">
      
        On-disk and mmap considerations
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#multilayer-index-architecture" class="md-nav__link">
    <span class="md-ellipsis">
      
        Multilayer index architecture
      
    </span>
  </a>
  
    <nav class="md-nav" aria-label="Multilayer index architecture">
      <ul class="md-nav__list">
        
          <li class="md-nav__item">
  <a href="#motivation" class="md-nav__link">
    <span class="md-ellipsis">
      
        Motivation
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#layer-structure" class="md-nav__link">
    <span class="md-ellipsis">
      
        Layer structure
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#membership-verification" class="md-nav__link">
    <span class="md-ellipsis">
      
        Membership verification
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#query-algorithm" class="md-nav__link">
    <span class="md-ellipsis">
      
        Query algorithm
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#layer-count-and-probe-cost" class="md-nav__link">
    <span class="md-ellipsis">
      
        Layer count and probe cost
      
    </span>
  </a>
  
</li>
        
          <li class="md-nav__item">
  <a href="#merging-layers" class="md-nav__link">
    <span class="md-ellipsis">
      
        Merging layers
      
    </span>
  </a>
  
</li>
        
      </ul>
    </nav>
  
</li>
      
        <li class="md-nav__item">
  <a href="#open-questions" class="md-nav__link">
    <span class="md-ellipsis">
      
        Open questions
      
    </span>
  </a>
  
</li>
      
    </ul>
  
</nav>
                  </div>
                </div>
              </div>
            
          
            <div class="md-content" data-md-component="content">
              
              <article class="md-content__inner md-typeset">
                
                  
<h1 id="mphf-selection-two-phase-indexing-architecture">MPHF selection — two-phase indexing architecture</h1>
<h2 id="indexing-architecture">Indexing architecture</h2>
<p>Kmer indexing per partition proceeds in two phases. The separation is necessary because the exact number of unique kmers in a partition is not known until after counting and filtering.</p>
<h3 id="superkmer-vs-kmer-counts">Superkmer vs kmer counts</h3>
<p>The <code>SKFileMeta</code> sidecar written by <code>SKFileWriter</code> records <code>instances</code> (unique superkmers) and <code>length_sum</code> (total nucleotides). A superkmer of length L contains L − k + 1 kmers, so summing over all superkmers of a partition gives a kmer count of <code>length_sum − instances × (k − 1)</code>. This is an <strong>overestimate</strong> of unique kmers (an upper bound, never an underestimate): two distinct superkmers (different flanking contexts, same minimizer) can share kmers, and each shared kmer is counted once per superkmer. The exact count of unique kmers is only known after enumerating and deduplicating them.</p>
<p>Note: two superkmers sharing a kmer necessarily share the same minimizer and therefore always land in the same partition — no kmer can appear in two different partitions.</p>
<h3 id="phase-1-provisional-index-and-spectrum">Phase 1 — provisional index and spectrum</h3>
<ol>
<li>Enumerate all kmers from the dereplicated superkmers of the partition.</li>
<li>Build a provisional MPHF over this key set; capacity is pre-allocated from the sidecar estimate (slight overestimate, harmless).</li>
<li>Accumulate counts: for each kmer in each superkmer, <code>count[MPHF(kmer)] += sk.count()</code>.</li>
<li>Compute the kmer frequency spectrum (histogram: occurrences → number of kmers).</li>
<li>Apply count filter (e.g. discard singletons). After filtering, the exact number of surviving kmers is known.</li>
<li>Discard the provisional MPHF.</li>
</ol>
<h3 id="phase-2-definitive-index">Phase 2 — definitive index</h3>
<p>Build a new MPHF over the filtered kmer set only, with the exact key count available. This is the persistent per-partition index used for all downstream operations (queries, set operations).</p>
<hr />
<h2 id="candidates">Candidates</h2>
<p><strong>boomphf</strong> (BBHash algorithm, maintained by 10X Genomics):</p>
<ul>
<li>~3.7 bits/key; mature crate, used in production bioinformatics (Pufferfish, Piscem)</li>
<li>Parallel construction; well-tested with DNA kmer data at scale</li>
<li>Drawback: largest space footprint; streaming construction (no exact count needed) was its main differentiator — irrelevant here since exact count is available at phase 2</li>
</ul>
<p><strong>ptr_hash</strong> (PtrHash algorithm, Groot Koerkamp, SEA 2025):</p>
<ul>
<li>~2.4 bits/key; fastest queries (≥2.1× over alternatives, 8–12 ns/key for u64 in tight loops) and fastest construction (≥3.1×)</li>
<li>Requires exact key count at construction — available at phase 2</li>
<li>Drawback: published February 2025 — very young, no production track record</li>
</ul>
<p><strong>FMPHGO</strong> (<code>ph</code> crate, Beling, ACM JEA 2023):</p>
<ul>
<li>~2.1 bits/key — most compact of the three; good query speed; parallelisable construction</li>
<li>More established than ptr_hash; actively maintained</li>
<li>Works well with overestimated capacity → natural fit for phase 1</li>
</ul>
<h2 id="mphf-choice-per-phase">MPHF choice per phase</h2>
<p><strong>Phase 1</strong> (provisional, discarded after spectrum computation): FMPHGO. Tolerates overestimated capacity, compact, no need to optimise for query speed on a temporary structure.</p>
<p><strong>Phase 2</strong> (persistent, queried repeatedly): <strong>ptr_hash</strong>. Exact key count is available at phase 2, so ptr_hash operates optimally. Its query speed (≥2.1× over FMPHGO) and construction speed (≥3.1×) are meaningful for the persistent index; the space overhead at 2.4 bits/key is acceptable. The crate's youth (Feb 2025) was previously a concern; it is now accepted given the performance profile and the fact that each layer MPHF is independently rebuildable from its unitig file if needed.</p>
<p>boomphf is effectively eliminated: its space overhead is the largest and its streaming-construction advantage does not apply here.</p>
<hr />
<h2 id="space-at-scale">Space at scale</h2>
<p>For 1 024 partitions × 100 M kmers/partition (phase 2 index, after filtering):</p>
<table>
<thead>
<tr>
<th>MPHF</th>
<th>bits/key</th>
<th>Total MPHF size</th>
</tr>
</thead>
<tbody>
<tr>
<td>boomphf</td>
<td>3.7</td>
<td>~47 GB</td>
</tr>
<tr>
<td>ptr_hash</td>
<td>2.4</td>
<td>~31 GB</td>
</tr>
<tr>
<td>FMPHGO</td>
<td>2.1</td>
<td>~27 GB</td>
</tr>
</tbody>
</table>
<p>For a human genome at 30× coverage with 1 024 partitions, realistic partition sizes are 3–30 M unique kmers → 1–8 MB per phase-2 MPHF, well within RAM.</p>
<h2 id="on-disk-and-mmap-considerations">On-disk and mmap considerations</h2>
<p>All three are in-memory structures. Their internal representation is flat bit arrays (no heap pointers), making them serialisable as contiguous byte blobs and mmappable per partition. True zero-copy access would require rkyv integration; the <code>ph</code> crate currently uses serde, so loading involves a copy. Given per-partition MPHF sizes of 1–8 MB, the OS page cache handles this transparently — strict zero-copy is a refinement, not a blocker.</p>
<p>No established Rust crate provides a natively on-disk MPHF. <strong>SSHash</strong> (Sparse and Skew Hash) is a complete kmer dictionary designed for disk access and is order-preserving (overlapping kmers receive consecutive indices → cache-friendly count access), but it is C++-only and covers more than just the MPHF layer.</p>
<hr />
<h2 id="multilayer-index-architecture">Multilayer index architecture</h2>
<h3 id="motivation">Motivation</h3>
<p>An index built from a single dataset A can be extended with a new dataset B without rebuilding. This supports incremental construction (adding species, samples, or sequencing runs) and enables set operations across heterogeneous sources.</p>
<h3 id="layer-structure">Layer structure</h3>
<p>Each layer is a self-contained unit:</p>
<div class="highlight"><pre><span></span><code>layer_i/
  unitigs.bin     — packed 2-bit nucleotide sequences
  mphf.bin        — ptr_hash index (phase-2, exact key count)
  evidence.bin    — [(unitig_id, rank)] per MPHF slot  (see unitig_evidence.md)
  counts.bin      — [u32] per MPHF slot
</code></pre></div>
<p>Layers are <strong>disjoint</strong>: a canonical kmer belongs to exactly one layer. Layer 0 is built from dataset A. Adding dataset B proceeds as follows:</p>
<ol>
<li>For each kmer in B: query layer 0 — if found, accumulate count into <code>counts_0[MPHF_0(kmer)]</code>.</li>
<li>Collect all kmers of B not present in any existing layer → set <code>B \ A</code>.</li>
<li>Build layer 1 from <code>B \ A</code> using the standard two-phase pipeline (spectrum, filter, ptr_hash).</li>
</ol>
<p>Adding a third dataset C repeats the process: probe layer 0, then layer 1, accumulating counts for the kmers that are found, then build layer 2 from the kmers found in neither layer, i.e. <code>C \ (A ∪ B)</code>.</p>
<h3 id="membership-verification">Membership verification</h3>
<p>ptr_hash maps any input to a valid slot — it does not natively detect absent keys. Membership is verified using the evidence entry: decode the kmer from <code>(unitig_id, rank)</code> and compare to the query. A mismatch means the kmer is absent from this layer; probe the next layer.</p>
<p>This makes the evidence layer load-bearing for correctness, not only for locality.</p>
<h3 id="query-algorithm">Query algorithm</h3>
<div class="highlight"><pre><span></span><code>fn query(kmer) → Option&lt;count&gt;:
    for layer in layers:
        slot = layer.mphf.query(kmer)
        if layer.evidence.decode(slot) == kmer:
            return Some(layer.counts[slot])
    return None
</code></pre></div>
<p>Expected probe depth: 1 for kmers present in layer 0, and one more probe for each later layer that has to be visited before the kmer is found. In practice, the dominant (largest) dataset should be layer 0 to minimise average probe depth.</p>
<h3 id="layer-count-and-probe-cost">Layer count and probe cost</h3>
<p>Each probe is a ptr_hash lookup (~10 ns) plus one evidence decode (two array accesses). For L layers, the worst case is a kmer absent from every layer: L failed probes, after which the query returns <code>None</code>. In practice L is small (2–5 for typical multi-species databases). No global data structure is needed to route queries; the layer chain is traversed in order.</p>
<h3 id="merging-layers">Merging layers</h3>
<p>Two layer chains can be merged by re-indexing their union through the standard pipeline. This is expensive (full rebuild) but produces an optimal single-layer index. Merge is a maintenance operation, not a query-path requirement.</p>
<h2 id="open-questions">Open questions</h2>
<ul>
<li>Confirm actual partition sizes and overestimation factor on representative metagenomic datasets.</li>
<li><strong>rkyv integration</strong>: all flat arrays in a layer (evidence, counts, presence/absence matrix) map trivially to <code>rkyv::Archive</code> — fixed-size element types, no heap indirection. The presence/absence matrix is the strongest case: at 10 M kmers × 1 000 samples and one bit per cell, it occupies ≈ 1.25 GB per partition; zero-copy mmap via rkyv avoids loading the entire matrix at open time, letting the OS page cache serve only accessed pages. ptr_hash itself is internally a flat bit array and is structurally compatible with rkyv, but requires either native crate support or a wrapper. Assess the wrapper cost and whether the ptr_hash maintainers would accept rkyv support upstream.</li>
<li>Keep SSHash in mind if the indexing architecture is reconsidered at a higher level.</li>
<li>Determine optimal layer ordering heuristic (by kmer count? by query frequency?) for multi-species databases.</li>
</ul>


              </article>
            </div>
          
          
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
        </div>
        
      </main>
      
        <footer class="md-footer">
  
  <div class="md-footer-meta md-typeset">
    <div class="md-footer-meta__inner md-grid">
      <div class="md-copyright">
  
  
    Made with
    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
      Material for MkDocs
    </a>
  
</div>
      
    </div>
  </div>
</footer>
      
    </div>
    <div class="md-dialog" data-md-component="dialog">
      <div class="md-dialog__inner md-typeset"></div>
    </div>
    
    
      <script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
    
    
      <script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
      
        <script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
      
    
  </body>
</html>