Files
obikmer/doc/architecture/query/index.html
T

1402 lines
33 KiB
HTML
Raw Normal View History

<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<link rel="icon" href="../../assets/images/favicon.png">
<meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
<title>Query system - obikmer</title>
<link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
<style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
<script>__md_scope=new URL("../..",location),__md_hash=e=>[...e].reduce(((e,_)=>(e<<5)-e+_.charCodeAt(0)),0),__md_get=(e,_=localStorage,t=__md_scope)=>JSON.parse(_.getItem(t.pathname+"."+e)),__md_set=(e,_,t=localStorage,a=__md_scope)=>{try{t.setItem(a.pathname+"."+e,JSON.stringify(_))}catch(e){}}</script>
</head>
<body dir="ltr">
<input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
<input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
<label class="md-overlay" for="__drawer"></label>
<div data-md-component="skip">
<a href="#query-system" class="md-skip">
Skip to content
</a>
</div>
<div data-md-component="announce">
</div>
<header class="md-header md-header--shadow" data-md-component="header">
<nav class="md-header__inner md-grid" aria-label="Header">
<a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
<label class="md-header__button md-icon" for="__drawer">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
</label>
<div class="md-header__title" data-md-component="header-title">
<div class="md-header__ellipsis">
<div class="md-header__topic">
<span class="md-ellipsis">
obikmer
</span>
</div>
<div class="md-header__topic" data-md-component="header-topic">
<span class="md-ellipsis">
Query system
</span>
</div>
</div>
</div>
<script>var palette=__md_get("__palette");if(palette&&palette.color){if("(prefers-color-scheme)"===palette.color.media){var media=matchMedia("(prefers-color-scheme: light)"),input=document.querySelector(media.matches?"[data-md-color-media='(prefers-color-scheme: light)']":"[data-md-color-media='(prefers-color-scheme: dark)']");palette.color.media=input.getAttribute("data-md-color-media"),palette.color.scheme=input.getAttribute("data-md-color-scheme"),palette.color.primary=input.getAttribute("data-md-color-primary"),palette.color.accent=input.getAttribute("data-md-color-accent")}for(var[key,value]of Object.entries(palette.color))document.body.setAttribute("data-md-color-"+key,value)}</script>
</nav>
</header>
<div class="md-container" data-md-component="container">
<main class="md-main" data-md-component="main">
<div class="md-main__inner md-grid">
<div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
<label class="md-nav__title" for="__drawer">
<a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>
</a>
obikmer
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../.." class="md-nav__link">
<span class="md-ellipsis">
Home
</span>
</a>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" >
<label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
<span class="md-ellipsis">
Theory
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_2">
<span class="md-nav__icon md-icon"></span>
Theory
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../kmers/" class="md-nav__link">
<span class="md-ellipsis">
Kmers and super-kmers
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/encoding/" class="md-nav__link">
<span class="md-ellipsis">
DNA encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/entropy/" class="md-nav__link">
<span class="md-ellipsis">
Entropy filter
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/minimizer/" class="md-nav__link">
<span class="md-ellipsis">
Minimizer selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../theory/indexing/" class="md-nav__link">
<span class="md-ellipsis">
Partitioning architecture
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
<label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
<span class="md-ellipsis">
Implementation
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_3">
<span class="md-nav__icon md-icon"></span>
Implementation
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../../implementation/superkmer/" class="md-nav__link">
<span class="md-ellipsis">
SuperKmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/kmer/" class="md-nav__link">
<span class="md-ellipsis">
Kmer
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/chunkreader/" class="md-nav__link">
<span class="md-ellipsis">
Chunk reader
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/pipeline/" class="md-nav__link">
<span class="md-ellipsis">
Construction pipeline
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/obipipeline/" class="md-nav__link">
<span class="md-ellipsis">
obipipeline library
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/storage/" class="md-nav__link">
<span class="md-ellipsis">
On-disk storage
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/mphf/" class="md-nav__link">
<span class="md-ellipsis">
MPHF selection
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/unitig_evidence/" class="md-nav__link">
<span class="md-ellipsis">
Unitig evidence encoding
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/evidence_elimination/" class="md-nav__link">
<span class="md-ellipsis">
Evidence elimination (discussion)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/obilayeredmap/" class="md-nav__link">
<span class="md-ellipsis">
obilayeredmap crate
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentCompactIntVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
<span class="md-ellipsis">
PersistentBitVec
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/merge/" class="md-nav__link">
<span class="md-ellipsis">
Merge command
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../../implementation/rebuild_filter/" class="md-nav__link">
<span class="md-ellipsis">
Kmer filtering (rebuild/dump/unitig)
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item md-nav__item--nested">
<input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
<label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
<span class="md-ellipsis">
Architecture
</span>
<span class="md-nav__icon md-icon"></span>
</label>
<nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
<label class="md-nav__title" for="__nav_4">
<span class="md-nav__icon md-icon"></span>
Architecture
</label>
<ul class="md-nav__list" data-md-scrollfix>
<li class="md-nav__item">
<a href="../sequences/invariant/" class="md-nav__link">
<span class="md-ellipsis">
Sequences
</span>
</a>
</li>
<li class="md-nav__item">
<a href="../index_architecture/" class="md-nav__link">
<span class="md-ellipsis">
Kmer index
</span>
</a>
</li>
</ul>
</nav>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
<div class="md-sidebar__scrollwrap">
<div class="md-sidebar__inner">
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
<label class="md-nav__title" for="__toc">
<span class="md-nav__icon md-icon"></span>
Table of contents
</label>
<ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
<li class="md-nav__item">
<a href="#goal" class="md-nav__link">
<span class="md-ellipsis">
Goal
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#input" class="md-nav__link">
<span class="md-ellipsis">
Input
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#algorithm" class="md-nav__link">
<span class="md-ellipsis">
Algorithm
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#findere-z-window-filter" class="md-nav__link">
<span class="md-ellipsis">
Findere z-window filter
</span>
</a>
<nav class="md-nav" aria-label="Findere z-window filter">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#effective-z-at-query-time" class="md-nav__link">
<span class="md-ellipsis">
Effective z at query time
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#layer-lookup-mphflayerfind" class="md-nav__link">
<span class="md-ellipsis">
Layer lookup: MphfLayer::find
</span>
</a>
<nav class="md-nav" aria-label="Layer lookup: MphfLayer::find">
<ul class="md-nav__list">
<li class="md-nav__item">
<a href="#querylayer-variant-selection" class="md-nav__link">
<span class="md-ellipsis">
QueryLayer variant selection
</span>
</a>
</li>
</ul>
</nav>
</li>
<li class="md-nav__item">
<a href="#presence-count-mode-at-query-time" class="md-nav__link">
<span class="md-ellipsis">
Presence / count mode at query time
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#coverage-vectors-detail" class="md-nav__link">
<span class="md-ellipsis">
Coverage vectors (--detail)
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#kmer_missing-semantics" class="md-nav__link">
<span class="md-ellipsis">
kmer_missing semantics
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#output-format" class="md-nav__link">
<span class="md-ellipsis">
Output format
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#annotation-schema" class="md-nav__link">
<span class="md-ellipsis">
Annotation schema
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#cli" class="md-nav__link">
<span class="md-ellipsis">
CLI
</span>
</a>
</li>
<li class="md-nav__item">
<a href="#future-work" class="md-nav__link">
<span class="md-ellipsis">
Future work
</span>
</a>
</li>
</ul>
</nav>
</div>
</div>
</div>
<div class="md-content" data-md-component="content">
<article class="md-content__inner md-typeset">
<h1 id="query-system">Query system</h1>
<h2 id="goal">Goal</h2>
<p>Given a set of query sequences, determine for each sequence how many of its k-mers are found in the index and, for each indexed genome, how many k-mers match. The query system is the foundation for read classification and sequence-to-genome mapping.</p>
<hr />
<h2 id="input">Input</h2>
<ul>
<li>Query sequences in FASTA or FASTQ format (gzip supported, streaming stdin supported). GenBank flat files are not supported at query time (only at index time).</li>
<li>Sequences shorter than k bases are silently skipped.</li>
<li>Non-ACGT characters are handled by the superkmer decomposition layer: they act as hard breaks, producing shorter superkmers (identical to the behaviour at indexing time).</li>
</ul>
<hr />
<h2 id="algorithm">Algorithm</h2>
<p>The query follows the same superkmer-based partitioning strategy used at indexing time.</p>
<div class="highlight"><pre><span></span><code>for each chunk of sequences (parallel workers via obipipeline):
build QueryBatch: decompose all sequences into s-mers via superkmers, deduplicate
allocate seq_results[seq_idx][smer_pos] = None ← per-sequence s-mer result vectors
split superkmers by partition via minimiser hash
for each partition p:
query_partition(p, superkmers_routed_to_p)
→ load QueryLayer(s) for p
→ for each s-mer in each superkmer: MphfLayer::find(smer)
fill seq_results[seq_idx][kmer_offset + j] from partition results
for each sequence:
apply_findere(seq_results[seq_idx], effective_z) ← per full sequence
accumulate confirmed k-mer results into acc and cov
emit annotated sequences
</code></pre></div>
<p>Superkmers that appear more than once in the batch (same sequence or across sequences) are deduplicated: each unique <code>RoutableSuperKmer</code> is queried once per partition, and the result is broadcast to every <code>SKDesc</code> entry that references it.</p>
<p><strong>Findere requires full-sequence aggregation.</strong> <code>apply_findere</code> is applied once per sequence on the complete s-mer result vector, after all partitions have contributed. Applying it per superkmer would produce false negatives at superkmer boundaries, where the z-window spans two superkmers.</p>
<p>Batches are processed in parallel via <code>obipipeline</code> workers; the <code>--threads</code> flag controls the number of worker threads.</p>
<hr />
<h2 id="findere-z-window-filter">Findere z-window filter</h2>
<p>For approximate index modes, the index physically stores s-mers of size <code>s = k_user − z + 1</code>. At query time, <code>set_k(s)</code> is in effect, so queries naturally produce s-mer results. <code>apply_findere</code> then aggregates z consecutive s-mer results into one k_user-mer answer:</p>
<div class="highlight"><pre><span></span><code><span class="k">fn</span><span class="w"> </span><span class="nf">apply_findere</span><span class="p">(</span>
<span class="w"> </span><span class="n">results</span><span class="p">:</span><span class="w"> </span><span class="kp">&amp;</span><span class="p">[</span><span class="nb">Option</span><span class="o">&lt;</span><span class="nb">Box</span><span class="o">&lt;</span><span class="p">[</span><span class="kt">u32</span><span class="p">]</span><span class="o">&gt;&gt;</span><span class="p">],</span><span class="w"> </span><span class="c1">// N s-mer results</span>
<span class="w"> </span><span class="n">z</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="w"> </span><span class="n">n_genomes</span><span class="p">:</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="p">)</span><span class="w"> </span><span class="p">-&gt;</span><span class="w"> </span><span class="nb">Vec</span><span class="o">&lt;</span><span class="nb">Option</span><span class="o">&lt;</span><span class="nb">Box</span><span class="o">&lt;</span><span class="p">[</span><span class="kt">u32</span><span class="p">]</span><span class="o">&gt;&gt;&gt;</span><span class="w"> </span><span class="c1">// N − z + 1 k_user-mer results</span>
</code></pre></div>
<p>Input length N (s-mers), output length N − z + 1 (k_user-mers).</p>
<p>For each genome g independently, a sliding window of size z scans the input. Output position i is confirmed for genome g iff all z values <code>results[i..i+z][g]</code> are nonzero (<code>None</code> counts as zero for all genomes). The scan is O(n) per genome.</p>
<p>Output values come from <code>results[i]</code> (leftmost s-mer of each window); genomes not confirmed are zeroed. If all genomes are zero, the position is returned as <code>None</code>.</p>
<p><strong>Short sequences</strong>: when the s-mer count is less than z, no complete window can form — <code>apply_findere</code> returns an empty vector. K-mers from sequences shorter than k_user are not emitted.</p>
<p><strong>Exact indexes</strong>: <code>z = 1</code>, <code>apply_findere</code> is a passthrough (output length = input length).</p>
<h3 id="effective-z-at-query-time">Effective z at query time</h3>
<p><code>effective_z</code> is resolved at the start of <code>run()</code>:</p>
<div class="highlight"><pre><span></span><code><span class="kd">let</span><span class="w"> </span><span class="n">effective_z</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">args</span><span class="p">.</span><span class="n">findere_z</span><span class="p">.</span><span class="n">unwrap_or_else</span><span class="p">(</span><span class="o">||</span><span class="w"> </span><span class="k">match</span><span class="w"> </span><span class="n">idx</span><span class="p">.</span><span class="n">meta</span><span class="p">().</span><span class="n">config</span><span class="p">.</span><span class="n">evidence</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">IndexMode</span><span class="p">::</span><span class="n">Approx</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">z</span><span class="p">,</span><span class="w"> </span><span class="o">..</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="o">|</span><span class="w"> </span><span class="n">IndexMode</span><span class="p">::</span><span class="n">Hybrid</span><span class="w"> </span><span class="p">{</span><span class="w"> </span><span class="n">z</span><span class="p">,</span><span class="w"> </span><span class="o">..</span><span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="o">=&gt;</span><span class="w"> </span><span class="n">z</span><span class="w"> </span><span class="k">as</span><span class="w"> </span><span class="kt">usize</span><span class="p">,</span>
<span class="w"> </span><span class="n">IndexMode</span><span class="p">::</span><span class="n">Exact</span><span class="w"> </span><span class="o">=&gt;</span><span class="w"> </span><span class="mi">1</span><span class="p">,</span>
<span class="p">});</span>
</code></pre></div>
<p>The <code>-z</code> CLI option overrides the index metadata value. A higher z increases stringency (lower FP, some true positives may be discarded at sequence ends); a lower z increases sensitivity.</p>
<hr />
<h2 id="layer-lookup-mphflayerfind">Layer lookup: <code>MphfLayer::find</code></h2>
<p><code>MphfLayer::open(dir, mode: &amp;IndexMode)</code> receives the mode from <code>PartitionMeta</code> — no per-layer file is read. The caller (<code>QueryLayer</code>) never chooses the dispatch path: it is fixed at open time by <code>LayerEvidence</code>. See <a href="../../implementation/obilayeredmap/">obilayeredmap</a> for the full <code>find</code> / <code>find_strict</code> API.</p>
<h3 id="querylayer-variant-selection"><code>QueryLayer</code> variant selection</h3>
<p><code>QueryLayer::open</code> in <code>query_layer.rs</code> selects the data matrix to pair with <code>MphfLayer</code>:</p>
<table>
<thead>
<tr>
<th>Condition</th>
<th>Variant</th>
<th>Data returned per k-mer</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>with_counts=true</code> and <code>counts/</code> exists</td>
<td><code>Count</code></td>
<td>raw count per genome</td>
</tr>
<tr>
<td><code>presence/</code> exists</td>
<td><code>Presence</code></td>
<td>0/1 per genome (bit matrix)</td>
</tr>
<tr>
<td>only <code>counts/</code> exists</td>
<td><code>Count</code></td>
<td>counts used as-is</td>
</tr>
<tr>
<td>neither exists</td>
<td><code>SetOnly</code></td>
<td>1 for every genome</td>
</tr>
</tbody>
</table>
<hr />
<h2 id="presence-count-mode-at-query-time">Presence / count mode at query time</h2>
<p>The <code>--force-presence</code> flag and <code>--presence-threshold</code> control how per-genome values are accumulated, independently of what the index stores:</p>
<div class="highlight"><pre><span></span><code>genome_totals[g] += if presence { u32::from(v &gt;= threshold) } else { v }
</code></pre></div>
<p><code>presence</code> is true when <code>--force-presence</code> is set or when the index has no counts (<code>!with_counts</code>). The default <code>presence_threshold</code> is 1, so any nonzero count counts as a match.</p>
<hr />
<h2 id="coverage-vectors-detail">Coverage vectors (<code>--detail</code>)</h2>
<p>When <code>--detail</code> is requested, a 3-D accumulator <code>cov[seq_idx][genome][kmer_pos]</code> is allocated after all partitions are queried, with dimensions derived from <code>n_kmers_out = n_smers − z + 1</code> (k_user-mer positions, not s-mer positions):</p>
<div class="highlight"><pre><span></span><code>cov[seq_idx][g][pos] += contribution
where pos is the k_user-mer index in the filtered (post-Findere) vector
</code></pre></div>
<p>Coverage reflects confirmed k_user-mers only. The vectors are emitted in the JSON annotation under the key <code>"coverage"</code>.</p>
<hr />
<h2 id="kmer_missing-semantics"><code>kmer_missing</code> semantics</h2>
<p><code>kmer_missing</code> counts k_user-mer positions where the first s-mer (<code>seq_results[seq_idx][pos]</code>) is <code>None</code> — i.e. absent from the index entirely. K-mers where the z-window fails because a later s-mer is absent or zero are not counted as missing (the first s-mer being present is used as proxy for index membership).</p>
<hr />
<h2 id="output-format">Output format</h2>
<p>Output sequences are written in <strong>OBITools4 format</strong>: the original sequence with a JSON annotation map in the title line.</p>
<div class="highlight"><pre><span></span><code>&gt;read_id {&quot;kmer_count&quot;:59,&quot;kmer_strict_matches&quot;:{&quot;genome_a&quot;:42,&quot;genome_b&quot;:7}}
ATCGATCG...
</code></pre></div>
<p>With <code>--detail</code>:</p>
<div class="highlight"><pre><span></span><code>&gt;read_id {&quot;kmer_count&quot;:59,&quot;kmer_strict_matches&quot;:{...},&quot;coverage&quot;:{&quot;genome_a&quot;:[0,1,2,...],...}}
ATCGATCG...
</code></pre></div>
<p>Genome keys follow the iteration order of <code>meta.genomes</code>.</p>
<hr />
<h2 id="annotation-schema">Annotation schema</h2>
<table>
<thead>
<tr>
<th>Key</th>
<th>Type</th>
<th>Condition</th>
<th>Semantics</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>kmer_count</code></td>
<td>int</td>
<td>always</td>
<td>k-mers confirmed (post-Findere) with at least one genome match</td>
</tr>
<tr>
<td><code>kmer_missing</code></td>
<td>int</td>
<td><code>--count-missing</code></td>
<td>k-mers absent from the index entirely (pre-Findere None)</td>
</tr>
<tr>
<td><code>kmer_strict_matches</code></td>
<td>object</td>
<td>always</td>
<td>per-genome accumulated value (label → count or 0/1)</td>
</tr>
<tr>
<td><code>coverage</code></td>
<td>object</td>
<td><code>--detail</code></td>
<td>per-genome array of per-position contributions (label → [u32])</td>
</tr>
</tbody>
</table>
<p><code>kmer_count + kmer_missing</code> ≤ total k_user-mers in the sequence. The gap corresponds to k_user-mers whose z-window was not fully confirmed (at least one s-mer absent or zero for all genomes) but whose first s-mer was present in the index.</p>
<hr />
<h2 id="cli">CLI</h2>
<div class="highlight"><pre><span></span><code>obikmer query &lt;index&gt; [--detail] [--mismatch] [--count-missing]
[--force-presence] [--presence-threshold &lt;n&gt;]
[-z &lt;z&gt;] [-T &lt;threads&gt;]
&lt;query.fa&gt; [&lt;query2.fa&gt; ...]
</code></pre></div>
<table>
<thead>
<tr>
<th>Option</th>
<th>Default</th>
<th>Semantics</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>-z</code> / <code>--findere-z</code></td>
<td>from index metadata</td>
<td>Override Findere z parameter</td>
</tr>
<tr>
<td><code>--detail</code></td>
<td>off</td>
<td>Emit per-position coverage vectors in JSON</td>
</tr>
<tr>
<td><code>--count-missing</code></td>
<td>off</td>
<td>Add <code>kmer_missing</code> field to JSON</td>
</tr>
<tr>
<td><code>--force-presence</code></td>
<td>off</td>
<td>Report 0/1 per genome regardless of index counts</td>
</tr>
<tr>
<td><code>--presence-threshold</code></td>
<td>1</td>
<td>Minimum count to declare genome present</td>
</tr>
<tr>
<td><code>-T</code> / <code>--threads</code></td>
<td>all CPUs</td>
<td>Worker threads</td>
</tr>
</tbody>
</table>
<p><code>--mismatch</code> is accepted but currently ignored with a warning on stderr.</p>
<hr />
<h2 id="future-work">Future work</h2>
<ul>
<li><strong><code>--mismatch</code></strong>: 1-mismatch approximate matching — generate <code>3·k</code> single-substitution variants per k-mer, look each up independently.</li>
<li><strong>Read classification</strong> (<code>--classify</code>): assign each read to the genome with the highest match score.</li>
<li><strong>Whitelist / blacklist filtering</strong>: threshold-based accept/reject on per-genome match scores.</li>
</ul>
</article>
</div>
<script>var target=document.getElementById(location.hash.slice(1));target&&target.name&&(target.checked=target.name.startsWith("__tabbed_"))</script>
</div>
</main>
<footer class="md-footer">
<div class="md-footer-meta md-typeset">
<div class="md-footer-meta__inner md-grid">
<div class="md-copyright">
Made with
<a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
Material for MkDocs
</a>
</div>
</div>
</div>
</footer>
</div>
<div class="md-dialog" data-md-component="dialog">
<div class="md-dialog__inner md-typeset"></div>
</div>
<script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
<script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
<script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
</body>
</html>