bb7adc1154
Expands MkDocs navigation and documentation for evidence elimination, the merge command, and kmer filtering. Refactors kmer representation to a generic `KmerOf<L>` type with a bitwise reverse complement algorithm. Unifies MPHF construction, introduces approximate fingerprint-based indexing, and updates the pipeline, chunkreader, and storage layouts. Adds code coverage reports and clarifies architectural invariants for improved maintainability.
1402 lines
33 KiB
HTML
1402 lines
33 KiB
HTML
|
||
<!doctype html>
|
||
<html lang="en" class="no-js">
|
||
<head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="icon" href="../../assets/images/favicon.png">
|
||
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
|
||
|
||
|
||
|
||
<title>Query system - obikmer</title>
|
||
|
||
|
||
|
||
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
||
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
|
||
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
|
||
|
||
|
||
|
||
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
|
||
|
||
|
||
|
||
|
||
|
||
</head>
|
||
|
||
|
||
<body dir="ltr">
|
||
|
||
|
||
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
|
||
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
|
||
<label class="md-overlay" for="__drawer"></label>
|
||
<div data-md-component="skip">
|
||
|
||
|
||
<a href="#query-system" class="md-skip">
|
||
Skip to content
|
||
</a>
|
||
|
||
</div>
|
||
<div data-md-component="announce">
|
||
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<header class="md-header md-header--shadow" data-md-component="header">
|
||
<nav class="md-header__inner md-grid" aria-label="Header">
|
||
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
<label class="md-header__button md-icon" for="__drawer">
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
|
||
</label>
|
||
<div class="md-header__title" data-md-component="header-title">
|
||
<div class="md-header__ellipsis">
|
||
<div class="md-header__topic">
|
||
<span class="md-ellipsis">
|
||
obikmer
|
||
</span>
|
||
</div>
|
||
<div class="md-header__topic" data-md-component="header-topic">
|
||
<span class="md-ellipsis">
|
||
|
||
Query system
|
||
|
||
</span>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
|
||
|
||
|
||
|
||
|
||
</nav>
|
||
|
||
</header>
|
||
|
||
<div class="md-container" data-md-component="container">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<main class="md-main" data-md-component="main">
|
||
<div class="md-main__inner md-grid">
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
|
||
|
||
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
|
||
<label class="md-nav__title" for="__drawer">
|
||
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
|
||
|
||
|
||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
|
||
|
||
</a>
|
||
obikmer
|
||
</label>
|
||
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../.." class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Home
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Theory
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_2">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Theory
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../kmers/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmers and super-kmers
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/encoding/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
DNA encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/entropy/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Entropy filter
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/minimizer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Minimizer selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../theory/indexing/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Partitioning architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_3">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Implementation
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/superkmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
SuperKmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/kmer/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/chunkreader/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Chunk reader
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/pipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Construction pipeline
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/obipipeline/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obipipeline library
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/storage/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
On-disk storage
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/mphf/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
MPHF selection
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/unitig_evidence/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Unitig evidence encoding
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/evidence_elimination/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Evidence elimination (discussion)
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
obilayeredmap crate
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentCompactIntVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
PersistentBitVec
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/merge/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Merge command
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../../implementation/rebuild_filter/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer filtering (rebuild/dump/unitig)
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item md-nav__item--nested">
|
||
|
||
|
||
|
||
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
|
||
|
||
|
||
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
<span class="md-nav__icon md-icon"></span>
|
||
</label>
|
||
|
||
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
|
||
<label class="md-nav__title" for="__nav_4">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
|
||
|
||
Architecture
|
||
|
||
|
||
</label>
|
||
<ul class="md-nav__list" data-md-scrollfix>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../sequences/invariant/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Sequences
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<li class="md-nav__item">
|
||
<a href="../index_architecture/" class="md-nav__link">
|
||
|
||
|
||
|
||
<span class="md-ellipsis">
|
||
|
||
|
||
Kmer index
|
||
|
||
|
||
|
||
</span>
|
||
|
||
|
||
|
||
</a>
|
||
</li>
|
||
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
|
||
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
|
||
<div class="md-sidebar__scrollwrap">
|
||
<div class="md-sidebar__inner">
|
||
|
||
|
||
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
|
||
|
||
|
||
|
||
|
||
|
||
|
||
<label class="md-nav__title" for="__toc">
|
||
<span class="md-nav__icon md-icon"></span>
|
||
Table of contents
|
||
</label>
|
||
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#goal" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Goal
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#input" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Input
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#algorithm" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Algorithm
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#findere-z-window-filter" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Findere z-window filter
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Findere z-window filter">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#effective-z-at-query-time" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Effective z at query time
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#layer-lookup-mphflayerfind" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Layer lookup: MphfLayer::find
|
||
|
||
</span>
|
||
</a>
|
||
|
||
<nav class="md-nav" aria-label="Layer lookup: MphfLayer::find">
|
||
<ul class="md-nav__list">
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#querylayer-variant-selection" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
QueryLayer variant selection
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
</nav>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#presence-count-mode-at-query-time" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Presence / count mode at query time
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#coverage-vectors-detail" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Coverage vectors (--detail)
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#kmer_missing-semantics" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
kmer_missing semantics
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#output-format" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Output format
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#annotation-schema" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Annotation schema
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#cli" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
CLI
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
<li class="md-nav__item">
|
||
<a href="#future-work" class="md-nav__link">
|
||
<span class="md-ellipsis">
|
||
|
||
Future work
|
||
|
||
</span>
|
||
</a>
|
||
|
||
</li>
|
||
|
||
</ul>
|
||
|
||
</nav>
|
||
</div>
|
||
</div>
|
||
</div>
|
||
|
||
|
||
|
||
<div class="md-content" data-md-component="content">
|
||
|
||
<article class="md-content__inner md-typeset">
|
||
|
||
|
||
|
||
|
||
|
||
<h1 id="query-system">Query system</h1>
|
||
<h2 id="goal">Goal</h2>
|
||
<p>Given a set of query sequences, determine for each sequence how many of its k-mers are found in the index and, for each indexed genome, how many k-mers match. The query system is the foundation for read classification and sequence-to-genome mapping.</p>
|
||
<hr />
|
||
<h2 id="input">Input</h2>
|
||
<ul>
|
||
<li>Query sequences in FASTA or FASTQ format (gzip supported, streaming stdin supported). GenBank flat files are not supported at query time (only at index time).</li>
|
||
<li>Sequences shorter than k bases are silently skipped.</li>
|
||
<li>Non-ACGT characters are handled by the superkmer decomposition layer: they act as hard breaks, producing shorter superkmers (identical to the behaviour at indexing time).</li>
|
||
</ul>
|
||
<hr />
|
||
<h2 id="algorithm">Algorithm</h2>
|
||
<p>The query follows the same superkmer-based partitioning strategy used at indexing time.</p>
|
||
<div class="highlight"><pre><span></span><code>for each chunk of sequences (parallel workers via obipipeline):
|
||
build QueryBatch: decompose all sequences into s-mers via superkmers, deduplicate
|
||
allocate seq_results[seq_idx][smer_pos] = None ← per-sequence s-mer result vectors
|
||
split superkmers by partition via minimiser hash
|
||
for each partition p:
|
||
query_partition(p, superkmers_routed_to_p)
|
||
→ load QueryLayer(s) for p
|
||
→ for each s-mer in each superkmer: MphfLayer::find(smer)
|
||
fill seq_results[seq_idx][kmer_offset + j] from partition results
|
||
for each sequence:
|
||
apply_findere(seq_results[seq_idx], effective_z) ← per full sequence
|
||
accumulate confirmed k-mer results into acc and cov
|
||
emit annotated sequences
|
||
</code></pre></div>
|
||
<p>Superkmers that appear more than once in the batch (same sequence or across sequences) are deduplicated: each unique <code>RoutableSuperKmer</code> is queried once per partition, and the result is broadcast to every <code>SKDesc</code> entry that references it.</p>
|
||
<p><strong>Findere requires full-sequence aggregation.</strong> <code>apply_findere</code> is applied once per sequence on the complete s-mer result vector, after all partitions have contributed. Applying it per superkmer would produce false negatives at superkmer boundaries, where the z-window spans two superkmers.</p>
|
||
<p>Batches are processed in parallel via <code>obipipeline</code> workers; the <code>--threads</code> flag controls the number of worker threads.</p>
|
||
<hr />
|
||
<h2 id="findere-z-window-filter">Findere z-window filter</h2>
|
||
<p>For approximate index modes, the index physically stores s-mers of size <code>s = k_user − z + 1</code>. At query time, <code>set_k(s)</code> is in effect, so queries naturally produce s-mer results. <code>apply_findere</code> then aggregates z consecutive s-mer results into one k_user-mer answer:</p>
|
||
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">apply_findere</span><span class="p">(</span>
|
||
<span class="w"> </span><span class="n">results</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="p">[</span><span class="nb">Option</span><span class="o">&lt;</span><span class="nb">Box</span><span class="o">&lt;</span><span class="p">[</span><span class="kt">u32</span><span class="p">]</span><span class="o">&gt;&gt;</span><span class="p">],</span><span class="w"> </span><span class="c1">// N s-mer results</span>
|
||
<span class="w"> </span><span class="n">z</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">n_genomes</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nb">Vec</span><span class="o">&lt;</span><span class="nb">Option</span><span class="o">&lt;</span><span class="nb">Box</span><span class="o">&lt;</span><span class="p">[</span><span class="kt">u32</span><span class="p">]</span><span class="o">&gt;&gt;&gt;</span><span class="w"> </span><span class="c1">// N − z + 1 k_user-mer results</span>
|
||
</code></pre></div>
|
||
<p>Input length N (s-mers), output length N − z + 1 (k_user-mers).</p>
|
||
<p>For each genome g independently, a sliding window of size z scans the input. Output position i is confirmed for genome g iff all z values <code>results[i..i+z][g]</code> are nonzero (<code>None</code> counts as zero for all genomes). The scan is O(n) per genome.</p>
|
||
<p>Output values come from <code>results[i]</code> (leftmost s-mer of each window); genomes not confirmed are zeroed. If all genomes are zero, the position is returned as <code>None</code>.</p>
|
||
<p><strong>Short sequences</strong>: when the s-mer count is less than z — equivalently, when the sequence is shorter than k_user = s + z − 1 bases — no complete window can form and <code>apply_findere</code> returns an empty vector, so no k_user-mer results are emitted for that sequence.</p>
|
||
<p><strong>Exact indexes</strong>: <code>z = 1</code>, <code>apply_findere</code> is a passthrough (output length = input length).</p>
|
||
<h3 id="effective-z-at-query-time">Effective z at query time</h3>
|
||
<p><code>effective_z</code> is resolved at the start of <code>run()</code>:</p>
|
||
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">effective_z</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">.</span><span class="n">findere_z</span><span class="p">.</span><span class="n">unwrap_or_else</span><span class="p">(</span><span class="o">||</span><span class="w"> </span><span class="k">match</span><span class="w"> </span><span class="n">idx</span><span class="p">.</span><span class="n">meta</span><span class="p">().</span><span class="n">config</span><span class="p">.</span><span class="n">evidence</span><span class="w"> </span><span class="p">{</span>
|
||
<span class="w"> </span><span class="n">IndexMode</span><span class="p">::</span><span class="n">Approx</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">z</span><span class="p">,</span><span class="w"> </span><span class="o">..</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">IndexMode</span><span class="p">::</span><span class="n">Hybrid</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">z</span><span class="p">,</span><span class="w"> </span><span class="o">..</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="o">=></span><span class="w"> </span><span class="n">z</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
|
||
<span class="w"> </span><span class="n">IndexMode</span><span class="p">::</span><span class="n">Exact</span><span class="w"> </span><span class="o">=></span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
|
||
<span class="p">});</span>
|
||
</code></pre></div>
|
||
<p>The <code>-z</code> CLI option overrides the index metadata value. A higher z increases stringency (fewer false positives, though some true positives may be discarded at sequence ends); a lower z increases sensitivity.</p>
|
||
<hr />
|
||
<h2 id="layer-lookup-mphflayerfind">Layer lookup: <code>MphfLayer::find</code></h2>
|
||
<p><code>MphfLayer::open(dir, mode: &IndexMode)</code> receives the mode from <code>PartitionMeta</code> — no per-layer file is read. The caller (<code>QueryLayer</code>) never chooses the dispatch path: it is fixed at open time by <code>LayerEvidence</code>. See <a href="../../implementation/obilayeredmap/">obilayeredmap</a> for the full <code>find</code> / <code>find_strict</code> API.</p>
|
||
<h3 id="querylayer-variant-selection"><code>QueryLayer</code> variant selection</h3>
|
||
<p><code>QueryLayer::open</code> in <code>query_layer.rs</code> selects the data matrix to pair with <code>MphfLayer</code>:</p>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>Condition</th>
|
||
<th>Variant</th>
|
||
<th>Data returned per k-mer</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td><code>with_counts=true</code> and <code>counts/</code> exists</td>
|
||
<td><code>Count</code></td>
|
||
<td>raw count per genome</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>presence/</code> exists</td>
|
||
<td><code>Presence</code></td>
|
||
<td>0/1 per genome (bit matrix)</td>
|
||
</tr>
|
||
<tr>
|
||
<td>only <code>counts/</code> exists</td>
|
||
<td><code>Count</code></td>
|
||
<td>counts used as-is</td>
|
||
</tr>
|
||
<tr>
|
||
<td>neither exists</td>
|
||
<td><code>SetOnly</code></td>
|
||
<td>1 for every genome</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<hr />
|
||
<h2 id="presence-count-mode-at-query-time">Presence / count mode at query time</h2>
|
||
<p>The <code>--force-presence</code> flag and <code>--presence-threshold</code> control how per-genome values are accumulated, independently of what the index stores:</p>
|
||
<div class="highlight"><pre><span></span><code>genome_totals[g] += if presence { u32::from(v >= threshold) } else { v }
|
||
</code></pre></div>
|
||
<p><code>presence</code> is true when <code>--force-presence</code> is set or when the index has no counts (<code>!with_counts</code>). The default <code>presence_threshold</code> is 1, so any nonzero count counts as a match.</p>
|
||
<hr />
|
||
<h2 id="coverage-vectors-detail">Coverage vectors (<code>--detail</code>)</h2>
|
||
<p>When <code>--detail</code> is requested, a 3-D accumulator <code>cov[seq_idx][genome][kmer_pos]</code> is allocated after all partitions are queried, with dimensions derived from <code>n_kmers_out = n_smers − z + 1</code> (k_user-mer positions, not s-mer positions):</p>
|
||
<div class="highlight"><pre><span></span><code>cov[seq_idx][g][pos] += contribution
|
||
where pos is the k_user-mer index in the filtered (post-Findere) vector
|
||
</code></pre></div>
|
||
<p>Coverage reflects confirmed k_user-mers only. The vectors are emitted in the JSON annotation under the key <code>"coverage"</code>.</p>
|
||
<hr />
|
||
<h2 id="kmer_missing-semantics"><code>kmer_missing</code> semantics</h2>
|
||
<p><code>kmer_missing</code> counts k_user-mer positions where the first s-mer (<code>seq_results[seq_idx][pos]</code>) is <code>None</code> — i.e. absent from the index entirely. K-mers where the z-window fails because a later s-mer is absent or zero are not counted as missing (the first s-mer being present is used as a proxy for index membership).</p>
|
||
<hr />
|
||
<h2 id="output-format">Output format</h2>
|
||
<p>Output sequences are written in <strong>OBITools4 format</strong>: the original sequence with a JSON annotation map in the title line.</p>
|
||
<div class="highlight"><pre><span></span><code>>read_id {"kmer_count":59,"kmer_strict_matches":{"genome_a":42,"genome_b":7}}
|
||
ATCGATCG...
|
||
</code></pre></div>
|
||
<p>With <code>--detail</code>:</p>
|
||
<div class="highlight"><pre><span></span><code>>read_id {"kmer_count":59,"kmer_strict_matches":{...},"coverage":{"genome_a":[0,1,2,...],...}}
|
||
ATCGATCG...
|
||
</code></pre></div>
|
||
<p>Genome keys follow the iteration order of <code>meta.genomes</code>.</p>
|
||
<hr />
|
||
<h2 id="annotation-schema">Annotation schema</h2>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>Key</th>
|
||
<th>Type</th>
|
||
<th>Condition</th>
|
||
<th>Semantics</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td><code>kmer_count</code></td>
|
||
<td>int</td>
|
||
<td>always</td>
|
||
<td>k-mers confirmed (post-Findere) with at least one genome match</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>kmer_missing</code></td>
|
||
<td>int</td>
|
||
<td><code>--count-missing</code></td>
|
||
<td>k_user-mer positions whose first s-mer is absent from the index entirely (pre-Findere <code>None</code>)</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>kmer_strict_matches</code></td>
|
||
<td>object</td>
|
||
<td>always</td>
|
||
<td>per-genome accumulated value (label → summed counts, or number of matching k-mers in presence mode)</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>coverage</code></td>
|
||
<td>object</td>
|
||
<td><code>--detail</code></td>
|
||
<td>per-genome array of per-position contributions (label → [u32])</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><code>kmer_count + kmer_missing</code> ≤ total k_user-mers in the sequence. The gap corresponds to k_user-mers whose first s-mer was present in the index but whose z-window was not confirmed for any genome (every genome had at least one absent or zero s-mer in the window).</p>
|
||
<hr />
|
||
<h2 id="cli">CLI</h2>
|
||
<div class="highlight"><pre><span></span><code>obikmer query &lt;index&gt; [--detail] [--mismatch] [--count-missing]
|
||
[--force-presence] [--presence-threshold &lt;n&gt;]
|
||
[-z &lt;z&gt;] [-T &lt;threads&gt;]
|
||
&lt;query.fa&gt; [&lt;query2.fa&gt; ...]
|
||
</code></pre></div>
|
||
<table>
|
||
<thead>
|
||
<tr>
|
||
<th>Option</th>
|
||
<th>Default</th>
|
||
<th>Semantics</th>
|
||
</tr>
|
||
</thead>
|
||
<tbody>
|
||
<tr>
|
||
<td><code>-z</code> / <code>--findere-z</code></td>
|
||
<td>from index metadata</td>
|
||
<td>Override Findere z parameter</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>--detail</code></td>
|
||
<td>off</td>
|
||
<td>Emit per-position coverage vectors in JSON</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>--count-missing</code></td>
|
||
<td>off</td>
|
||
<td>Add <code>kmer_missing</code> field to JSON</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>--force-presence</code></td>
|
||
<td>off</td>
|
||
<td>Accumulate 0/1 per k-mer for each genome (thresholded by <code>--presence-threshold</code>), regardless of index counts</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>--presence-threshold</code></td>
|
||
<td>1</td>
|
||
<td>Minimum count to declare genome present</td>
|
||
</tr>
|
||
<tr>
|
||
<td><code>-T</code> / <code>--threads</code></td>
|
||
<td>all CPUs</td>
|
||
<td>Worker threads</td>
|
||
</tr>
|
||
</tbody>
|
||
</table>
|
||
<p><code>--mismatch</code> is accepted but currently ignored with a warning on stderr.</p>
|
||
<hr />
|
||
<h2 id="future-work">Future work</h2>
|
||
<ul>
|
||
<li><strong><code>--mismatch</code></strong>: 1-mismatch approximate matching — generate <code>3·k</code> single-substitution variants per k-mer, look each up independently.</li>
|
||
<li><strong>Read classification</strong> (<code>--classify</code>): assign each read to the genome with the highest match score.</li>
|
||
<li><strong>Whitelist / blacklist filtering</strong>: threshold-based accept/reject on per-genome match scores.</li>
|
||
</ul>
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
</article>
|
||
</div>
|
||
|
||
|
||
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
|
||
</div>
|
||
|
||
</main>
|
||
|
||
<footer class="md-footer">
|
||
|
||
<div class="md-footer-meta md-typeset">
|
||
<div class="md-footer-meta__inner md-grid">
|
||
<div class="md-copyright">
|
||
|
||
|
||
Made with
|
||
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
|
||
Material for MkDocs
|
||
</a>
|
||
|
||
</div>
|
||
|
||
</div>
|
||
</div>
|
||
</footer>
|
||
|
||
</div>
|
||
<div class="md-dialog" data-md-component="dialog">
|
||
<div class="md-dialog__inner md-typeset"></div>
|
||
</div>
|
||
|
||
|
||
|
||
|
||
|
||
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
|
||
|
||
|
||
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
|
||
|
||
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
|
||
|
||
|
||
</body>
|
||
</html> |