2026-06-22 08:47:24 +00:00
42 changed files with 2585 additions and 84 deletions
@@ -9,3 +9,13 @@ data-stress
 ./**/*.json
 *.bin
 Betula_exilis--IGA-24-33
+benchmark/genomes
+benchmark/simulated_data
+benchmark/specimen_index_presence
+benchmark/specimen_index_count
+benchmark/global_index_presence
+benchmark/global_index_count
+benchmark/stats
+benchmark/reference_index
+benchmark/specific_index_count
+benchmark/specific_index_presence
@@ -0,0 +1,2 @@
+/cache
+/project.local.yml
@@ -0,0 +1,133 @@
+# the name by which the project can be referenced within Serena
+project_name: "obikmer"
+
+
+# list of languages for which language servers are started; choose from:
+#   al                  angular             ansible             bash                clojure
+#   cpp                 cpp_ccls            crystal             csharp              csharp_omnisharp
+#   dart                elixir              elm                 erlang              fortran
+#   fsharp              go                  groovy              haskell             haxe
+#   hlsl                html                java                json                julia
+#   kotlin              lean4               lua                 luau                markdown
+#   matlab              msl                 nix                 ocaml               pascal
+#   perl                php                 php_phpactor        powershell          python
+#   python_jedi         python_ty           r                   rego                ruby
+#   ruby_solargraph     rust                scala               scss                solidity
+#   svelte              swift               systemverilog       terraform           toml
+#   typescript          typescript_vts      vue                 yaml                zig
+#   (This list may be outdated. For the current list, see values of Language enum here:
+#   https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
+#   For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
+# Note:
+#   - For C, use cpp
+#   - For JavaScript, use typescript
+#   - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
+#   - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
+#   - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
+#   - For Free Pascal/Lazarus, use pascal
+# Special requirements:
+#   Some languages require additional setup/installations.
+#   See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
+# When using multiple languages, the first language server that supports a given file will be used for that file.
+# The first language is the default language and the respective language server will be used as a fallback.
+# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
+languages:
+- rust
+
+# the encoding used by text files in the project
+# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
+encoding: "utf-8"
+
+# line ending convention to use when writing source files.
+# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
+# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
+line_ending:
+
+# The language backend to use for this project.
+# If not set, the global setting from serena_config.yml is used.
+# Valid values: LSP, JetBrains
+# Note: the backend is fixed at startup. If a project with a different backend
+# is activated post-init, an error will be returned.
+language_backend:
+
+# whether to use project's .gitignore files to ignore files
+ignore_all_files_in_gitignore: true
+
+# advanced configuration option allowing to configure language server-specific options.
+# Maps the language key to the options.
+# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
+# No documentation on options means no options are available.
+ls_specific_settings: {}
+
+# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
+# Paths can be absolute or relative to the project root.
+# Each folder is registered as an LSP workspace folder, enabling language servers to discover
+# symbols and references across package boundaries.
+# Currently supported for: TypeScript.
+# Example:
+#   additional_workspace_folders:
+#     - ../sibling-package
+#     - ../shared-lib
+additional_workspace_folders: []
+
+# list of additional paths to ignore in this project.
+# Same syntax as gitignore, so you can use * and **.
+# Note: global ignored_paths from serena_config.yml are also applied additively.
+ignored_paths: []
+
+# whether the project is in read-only mode
+# If set to true, all editing tools will be disabled and attempts to use them will result in an error
+# Added on 2025-04-18
+read_only: false
+
+# list of tool names to exclude.
+# This extends the existing exclusions (e.g. from the global configuration)
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+excluded_tools: []
+
+# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
+# This extends the existing inclusions (e.g. from the global configuration).
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+included_optional_tools: []
+
+# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
+# This cannot be combined with non-empty excluded_tools or included_optional_tools.
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+fixed_tools: []
+
+# list of mode names that are to be activated by default, overriding the setting in the global configuration.
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
+# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
+# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
+# for this project.
+# This setting can, in turn, be overridden by CLI parameters (--mode).
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+default_modes:
+
+# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+added_modes:
+
+# initial prompt for the project. It will always be given to the LLM upon activating the project
+# (contrary to the memories, which are loaded on demand).
+initial_prompt: ""
+
+# time budget (seconds) per tool call for the retrieval of additional symbol information
+# such as docstrings or parameter information.
+# This overrides the corresponding setting in the global configuration; see the documentation there.
+# If null or missing, use the setting from the global configuration.
+symbol_info_budget:
+
+# list of regex patterns which, when matched, mark a memory entry as read‑only.
+# Extends the list from the global configuration, merging the two lists.
+read_only_memory_patterns: []
+
+# list of regex patterns for memories to completely ignore.
+# Matching memories will not appear in list_memories or activate_project output
+# and cannot be accessed via read_memory or write_memory.
+# To access ignored memory files, use the read_file tool on the raw file path.
+# Extends the list from the global configuration, merging the two lists.
+# Example: ["_archive/.*", "_episodes/.*"]
+ignored_memory_patterns: []
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
 ---

 Je continue à poser mes questions et à guider la discussion.
+
+---
+
+## MCP Tools
+
+**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
+
+### Hiérarchie des outils pour ce projet Rust
+
+**Navigation et édition de code → serena en priorité**
+- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
+- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
+- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
+- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
+- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
+- Ne pas utiliser `cclsp` quand serena couvre le besoin
+
+**Analyse architecturale → jcodemunch**
+- Hotspots, couplage, dead code, dépendances entre modules
+- Utiliser avant de refactorer une zone critique
+
+**Raisonnement complexe → sequential-thinking**
+- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
+
+**Documentation de crates → context7**
+- Toujours consulter avant d'utiliser une API de bibliothèque externe
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
 		mkdocs mkdocs-material \
 		mkdocs-mermaid2-plugin \
 		mkdocs-bibtex
+	$(PIP) install --quiet --upgrade InSilicoSeq

 # ── obikmer binary ───────────────────────────────────────────────────────────

@@ -0,0 +1,144 @@
+# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
+BINARY  := ../src/target/release/obikmer
+VENV_PY := ../.venv/bin/python3
+
+GENOMES := $(wildcard genomes/*.fna.gz)
+
+# SPECIMENS, SPECIES, and the full dependency graph are generated by
+# make_deps.py from the genome FASTA headers — like .d files in C.
+# Make rebuilds deps.mk whenever genomes/ changes and restarts.
+-include deps.mk
+
+REF_NPZS              := $(SPECIMENS:%=reference_index/%.npz)
+PRESENCE_DONE         := $(SPECIMENS:%=specimen_index_presence/%/index.done)
+PRESENCE_STATS        := $(SPECIMENS:%=stats/indexing_presence/%.stats)
+COUNT_DONE            := $(SPECIMENS:%=specimen_index_count/%/index.done)
+COUNT_STATS           := $(SPECIMENS:%=stats/indexing_count/%.stats)
+VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
+VERIFY_COUNT_STATS    := $(SPECIMENS:%=stats/verify_count/%.stats)
+SPECIFIC_PRESENCE_DONE  := $(SPECIES:%=specific_index_presence/%/index.done)
+SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
+SPECIFIC_COUNT_DONE     := $(SPECIES:%=specific_index_count/%/index.done)
+SPECIFIC_COUNT_STATS    := $(SPECIES:%=stats/specific_kmer_count/%.stats)
+SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
+
+.NOTPARALLEL:
+
+.PHONY: all simulate reference \
+        index_presence index_count \
+        aggregate_index_presence aggregate_index_count \
+        merge_presence merge_count \
+        verify_presence verify_count \
+        aggregate_verify_presence aggregate_verify_count \
+        verify_merge_presence verify_merge_count \
+        filter_presence filter_count \
+        aggregate_filter_presence aggregate_filter_count
+
+verify_merge_presence: stats/verify_merge_presence/current.csv
+verify_merge_count:    stats/verify_merge_count/current.csv
+
+all: aggregate_verify_presence aggregate_verify_count \
+     verify_merge_presence verify_merge_count \
+     aggregate_filter_presence aggregate_filter_count
+
+# ── dependency file ───────────────────────────────────────────────────────────
+
+deps.mk: $(GENOMES)
+	$(VENV_PY) make_deps.py $^ > $@
+
+# ── simulation ────────────────────────────────────────────────────────────────
+# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
+
+$(SIMULATED_READS):
+	bash simulate_one.sh $< $(dir $@)
+
+simulate: $(SIMULATED_READS)
+
+# ── reference kmer sets ───────────────────────────────────────────────────────
+# Prerequisites (reads → npz) are in deps.mk.
+
+reference_index/%.npz:
+	bash build_reference.sh $*
+
+reference: $(REF_NPZS)
+
+# ── per-specimen indexing ─────────────────────────────────────────────────────
+# Prerequisites (reads → index.done + .stats) are in deps.mk.
+
+specimen_index_presence/%/index.done \
+stats/indexing_presence/%.stats &: $(BINARY)
+	bash index_one_presence.sh $*
+
+specimen_index_count/%/index.done \
+stats/indexing_count/%.stats &: $(BINARY)
+	bash index_one_count.sh $*
+
+index_presence: $(PRESENCE_DONE)
+index_count:    $(COUNT_DONE)
+
+# ── indexing stats aggregation ────────────────────────────────────────────────
+
+aggregate_index_presence: $(PRESENCE_STATS)
+	bash aggregate_stats.sh indexing_presence
+
+aggregate_index_count: $(COUNT_STATS)
+	bash aggregate_stats.sh indexing_count
+
+# ── global merge ──────────────────────────────────────────────────────────────
+
+global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
+	bash merge_presence.sh
+
+global_index_count/index.done: $(COUNT_DONE) $(BINARY)
+	bash merge_count.sh
+
+merge_presence: global_index_presence/index.done
+merge_count:    global_index_count/index.done
+
+# ── per-specimen verification ─────────────────────────────────────────────────
+# Prerequisites (index.done + npz → .stats) are in deps.mk.
+
+stats/verify_presence/%.stats:
+	bash verify_one_presence.sh $*
+
+stats/verify_count/%.stats:
+	bash verify_one_count.sh $*
+
+verify_presence: $(VERIFY_PRESENCE_STATS)
+verify_count:    $(VERIFY_COUNT_STATS)
+
+# ── verification stats aggregation ───────────────────────────────────────────
+
+aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
+	bash aggregate_stats.sh verify_presence
+
+aggregate_verify_count: $(VERIFY_COUNT_STATS)
+	bash aggregate_stats.sh verify_count
+
+# ── species-specific indexes ──────────────────────────────────────────────────
+# Prerequisites (global index → specific index) are in deps.mk.
+
+specific_index_presence/%/index.done \
+stats/specific_kmer_presence/%.stats &: $(BINARY)
+	bash filter_one_presence.sh $*
+
+specific_index_count/%/index.done \
+stats/specific_kmer_count/%.stats &: $(BINARY)
+	bash filter_one_count.sh $*
+
+filter_presence: $(SPECIFIC_PRESENCE_DONE)
+filter_count:    $(SPECIFIC_COUNT_DONE)
+
+aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
+	bash aggregate_stats.sh specific_kmer_presence
+
+aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
+	bash aggregate_stats.sh specific_kmer_count
+
+# ── merged index verification ─────────────────────────────────────────────────
+
+stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
+	bash verify_merge_presence.sh
+
+stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
+	bash verify_merge_count.sh
@@ -0,0 +1,132 @@
+# Benchmark pipeline
+
+Requires **GNU Make ≥ 4.3** (grouped targets `&:`).  On macOS use `gmake`.
+
+```
+gmake all          # full pipeline
+gmake simulate     # simulation only
+gmake reference    # reference kmer sets only
+```
+
+## Pipeline overview
+
+```mermaid
+flowchart TD
+    GENOMES["genomes/*.fna.gz"]
+    BIN["obikmer binary"]
+
+    GENOMES --> simulate
+    simulate --> simdata[("simulated_data/")]
+
+    simdata --> reference
+    reference --> refnpz[("reference_index/*.npz")]
+
+    subgraph presence ["Presence track"]
+        simdata  --> index_presence
+        BIN      --> index_presence
+        index_presence --> pres_done[("specimen_index_presence/")]
+        index_presence --> pres_istats[("stats/indexing_presence/")]
+        pres_istats --> aggregate_index_presence
+
+        pres_done --> merge_presence
+        BIN       --> merge_presence
+        merge_presence --> gpres[("global_index_presence/")]
+
+        refnpz    --> verify_presence
+        pres_done --> verify_presence
+        verify_presence --> vpres_stats[("stats/verify_presence/")]
+        vpres_stats --> aggregate_verify_presence
+
+        gpres --> filter_presence
+        BIN   --> filter_presence
+        filter_presence --> spec_pres[("specific_index_presence/")]
+        filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
+        spec_pres_stats --> aggregate_filter_presence
+
+        refnpz --> verify_merge_presence
+        gpres  --> verify_merge_presence
+        verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
+    end
+
+    subgraph count ["Count track"]
+        simdata --> index_count
+        BIN     --> index_count
+        index_count --> count_done[("specimen_index_count/")]
+        index_count --> count_istats[("stats/indexing_count/")]
+        count_istats --> aggregate_index_count
+
+        count_done --> merge_count
+        BIN        --> merge_count
+        merge_count --> gcount[("global_index_count/")]
+
+        refnpz     --> verify_count
+        count_done --> verify_count
+        verify_count --> vcount_stats[("stats/verify_count/")]
+        vcount_stats --> aggregate_verify_count
+
+        gcount --> filter_count
+        BIN    --> filter_count
+        filter_count --> spec_count[("specific_index_count/")]
+        filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
+        spec_count_stats --> aggregate_filter_count
+
+        refnpz --> verify_merge_count
+        gcount --> verify_merge_count
+        verify_merge_count --> vmc[("stats/verify_merge_count/")]
+    end
+
+    aggregate_verify_presence  --> all
+    aggregate_verify_count     --> all
+    vmp                        --> all
+    vmc                        --> all
+    all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
+    all -. "$(MAKE) re-eval" .-> aggregate_filter_count
+```
+
+## Steps
+
+| Target | Script | Description |
+|---|---|---|
+| `simulate` | `simulate.sh` | Simulate sequencing reads from the reference genomes |
+| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
+| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
+| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
+| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
+| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
+| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
+| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
+| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
+| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
+| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
+| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
+| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
+| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
+| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
+| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
+| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
+| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
+
+## Directory layout
+
+```
+benchmark/
+├── genomes/                        # input reference genomes (.fna.gz)
+├── simulated_data/                 # generated by simulate
+│   └── <species>/<specimen>/
+├── reference_index/                # reference kmer sets (.npz)
+├── specimen_index_presence/        # per-specimen presence indexes
+├── specimen_index_count/           # per-specimen count indexes
+├── global_index_presence/          # merged global presence index
+├── global_index_count/             # merged global count index
+├── specific_index_presence/        # species-specific presence indexes
+├── specific_index_count/           # species-specific count indexes
+└── stats/                          # all benchmark statistics
+    ├── indexing_presence/
+    ├── indexing_count/
+    ├── verify_presence/
+    ├── verify_count/
+    ├── specific_kmer_presence/
+    ├── specific_kmer_count/
+    ├── verify_merge_presence/
+    └── verify_merge_count/
+```
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Usage: aggregate_stats.sh TYPE
+# TYPE = indexing_presence | indexing_count | verify_presence | verify_count
+#
+# Reads all stats/TYPE/*.stats files (one CSV data row each, no header).
+# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
+# the most recent run CSV (idempotent when nothing changed).
+set -euo pipefail
+
+TYPE="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
+
+case "${TYPE}" in
+    indexing_presence|indexing_count)
+        HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
+        ;;
+    verify_presence)
+        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
+        ;;
+    verify_count)
+        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
+        ;;
+    specific_kmer_presence|specific_kmer_count)
+        HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
+        ;;
+    *)
+        echo "ERROR: unknown stats type '${TYPE}'" >&2
+        exit 1
+        ;;
+esac
+
+# Find most recent existing run CSV (empty string if none).
+latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1)
+
+# Check if any .stats file is newer than the latest run CSV.
+if [[ -n "${latest_csv}" ]] && \
+   [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
+    echo "[${TYPE}] stats up to date (${latest_csv})"
+    exit 0
+fi
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+echo "${HEADER}" >"${CSV}"
+
+# Sort .stats files by name for reproducible row order.
+while IFS= read -r stats_file; do
+    sed "s/^/${run_n},/" "${stats_file}"
+done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"
+
+echo "[${TYPE}] run ${run_n} → ${CSV}"
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""Build a reference kmer index from paired-end FASTQ reads.
+
+Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
+counts their abundances, and saves a sorted numpy pair (kmers, counts).
+
+Output .npz arrays
+  kmers  : uint64, sorted ascending — canonical kmer integers
+  counts : uint32, same order      — raw read abundances
+"""
+import argparse
+import gzip
+import sys
+from collections import defaultdict
+
+import numpy as np
+
+
+# ── encoding ────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+# Lookup table: revcomp of one byte (4 bases, 8 bits).
+# Precomputed once at import time.
+_REVCOMP8 = [0] * 256
+for _i in range(256):
+    _rc, _x = 0, _i
+    for _ in range(4):
+        _rc = (_rc << 2) | (3 - (_x & 3))
+        _x >>= 2
+    _REVCOMP8[_i] = _rc
+del _i, _rc, _x
+
+
+def revcomp_int(kmer: int, k: int) -> int:
+    """Reverse-complement of a kmer encoded as an integer (2 bits/base).
+
+    Uses byte-level lookup (4 bases at a time) for speed.
+    """
+    rc = 0
+    bits_left = 2 * k
+    while bits_left > 0:
+        chunk = min(8, bits_left)
+        rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk)
+        rc = (rc << chunk) | rc_byte
+        kmer >>= chunk
+        bits_left -= chunk
+    return rc
+
+
+# ── FASTQ parsing ────────────────────────────────────────────────────────────
+
+def iter_sequences(path: str):
+    """Yield raw sequences from a (gzipped) FASTQ file."""
+    opener = gzip.open if path.endswith('.gz') else open
+    with opener(path, 'rt') as fh:
+        while True:
+            if not fh.readline():   # '@' header
+                break
+            seq = fh.readline().rstrip('\n')
+            fh.readline()           # '+'
+            fh.readline()           # quality
+            yield seq
+
+
+# ── kmer counting ────────────────────────────────────────────────────────────
+
+def count_kmers(paths: list[str], k: int) -> dict[int, int]:
+    mask = (1 << (2 * k)) - 1
+    counts: dict[int, int] = defaultdict(int)
+    n_reads = 0
+
+    for path in paths:
+        for seq in iter_sequences(path):
+            n_reads += 1
+            kmer = 0
+            run = 0          # consecutive valid bases
+
+            for c in seq:
+                b = _ENCODE.get(c)
+                if b is None:    # N or unexpected character → reset
+                    kmer = 0
+                    run = 0
+                    continue
+                kmer = ((kmer << 2) | b) & mask
+                run += 1
+                if run >= k:
+                    rc = revcomp_int(kmer, k)
+                    counts[kmer if kmer <= rc else rc] += 1
+
+            if n_reads % 100_000 == 0:
+                print(f'  {n_reads:,} reads processed, '
+                      f'{len(counts):,} distinct kmers so far',
+                      file=sys.stderr)
+
+    print(f'  {n_reads:,} reads total, {len(counts):,} distinct kmers',
+          file=sys.stderr)
+    return counts
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reads', nargs='+', metavar='FASTQ',
+                    help='Input reads (FASTQ, gzip OK)')
+    ap.add_argument('-k', '--kmer-size', type=int, default=31,
+                    metavar='K')
+    ap.add_argument('--min-abundance', type=int, default=1,
+                    metavar='N', help='Drop kmers with count < N (default 1)')
+    ap.add_argument('-o', '--output', required=True,
+                    metavar='FILE', help='Output .npz path')
+    args = ap.parse_args()
+
+    print(f'k={args.kmer_size}  files={len(args.reads)}', file=sys.stderr)
+    counts = count_kmers(args.reads, args.kmer_size)
+
+    if args.min_abundance > 1:
+        before = len(counts)
+        counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
+        print(f'  min-abundance={args.min_abundance}: '
+              f'{before - len(counts):,} kmers dropped, '
+              f'{len(counts):,} retained',
+              file=sys.stderr)
+
+    print(f'Sorting and saving → {args.output}', file=sys.stderr)
+    kmers_arr  = np.fromiter(sorted(counts), dtype=np.uint64, count=len(counts))
+    counts_arr = np.array([counts[int(k)] for k in kmers_arr], dtype=np.uint32)
+
+    np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
+    print(f'Done  {len(kmers_arr):,} kmers  →  {args.output}', file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+BUILD_PY="${SCRIPT_DIR}/build_reference.py"
+
+KMER_SIZE="${KMER_SIZE:-31}"
+MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
+
+mkdir -p "${REF_DIR}"
+
+for species_dir in "${SIMDATA_DIR}"/*/; do
+    [[ -d "${species_dir}" ]] || continue
+    species=$(basename "${species_dir}")
+
+    for strain_dir in "${species_dir}"*/; do
+        [[ -d "${strain_dir}" ]] || continue
+        strain=$(basename "${strain_dir}")
+
+        r1="${strain_dir}/reads_R1.fastq.gz"
+        r2="${strain_dir}/reads_R2.fastq.gz"
+        if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+            echo "SKIP ${species}--${strain}: reads not found" >&2
+            continue
+        fi
+
+        out="${REF_DIR}/${species}--${strain}.npz"
+        echo "[${species}--${strain}] → ${out}"
+
+        "${PYTHON}" "${BUILD_PY}" \
+            --kmer-size      "${KMER_SIZE}" \
+            --min-abundance  "${MIN_ABUNDANCE}" \
+            --output         "${out}" \
+            "${r1}" "${r2}"
+    done
+done
@@ -0,0 +1,199 @@
+SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
+SPECIES   := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
+
+# Escherichia_coli--K-12_MG1655
+simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
+reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
+stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
+
+# Escherichia_coli--EDL933
+simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
+reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
+stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
+
+# Salmonella_enterica--LT2
+simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
+reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
+stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
+
+# Escherichia_coli--CFT073
+simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
+reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
+stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
+
+# Bacillus_subtilis--168
+simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
+reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
+stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
+
+# Salmonella_enterica--P125109
+simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
+reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
+stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
+
+# Shouchella_clausii--KSM-K16
+simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
+reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
+stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
+
+# Escherichia_coli--K-12_W3110
+simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
+reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
+stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
+
+# Klebsiella_pneumoniae--MGH_78578
+simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
+stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
+
+# Opitutus_terrae--PB90-1
+simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
+reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
+stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
+
+# Saccharolobus_islandicus--M.16.4
+simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
+reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
+stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
+
+# Acidobacterium_capsulatum--ATCC_51196
+simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
+reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
+stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
+
+# Salmonella_enterica--AKU_12601
+simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
+reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
+stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
+
+# Proteus_mirabilis--HI4320
+simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
+reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
+stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
+
+# Salmonella_enterica--CT18
+simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
+reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
+stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
+
+# Klebsiella_pneumoniae--HS11286
+simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
+stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
+
+# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
+simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
+reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
+stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
+
+# Klebsiella_pneumoniae--ATCC_13883
+simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
+stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
+
+# Yersinia_ruckeri--YRB
+simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
+reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
+stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
+
+# Candidozyma_auris--GCF_003013715.1_ASM301371v2
+simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
+reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
+stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
+
+# Escherichia_coli
+specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
+specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
+# Salmonella_enterica
+specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
+specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
+# Bacillus_subtilis
+specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
+specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
+# Shouchella_clausii
+specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
+specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
+# Klebsiella_pneumoniae
+specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
+specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
+# Opitutus_terrae
+specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
+specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
+# Saccharolobus_islandicus
+specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
+specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
+# Acidobacterium_capsulatum
+specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
+specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
+# Proteus_mirabilis
+specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
+specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
+# Wolbachia_endosymbiont
+specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
+specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
+# Yersinia_ruckeri
+specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
+specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
+# Candidozyma_auris
+specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
+specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+assemblies=(
+    GCF_000005845.2
+    GCF_000010245.2
+    GCF_000007445.1
+    GCF_000006665.1
+
+    GCF_000006945.2
+    GCF_000195995.1
+    GCF_000009505.1
+    GCF_000026565.1
+
+    GCF_000016305.1
+    GCF_000019965.1
+    GCF_000240185.1
+    GCF_000742135.1
+
+    GCF_000069965.1
+    GCF_000022565.1
+    GCF_000306885.1
+    GCF_003013715.1
+
+    GCF_000009045.1
+    GCF_000009825.1
+    GCF_000022445.1
+    GCF_000834255.1
+)
+
+mkdir -p genomes
+
+for acc in "${assemblies[@]}"; do
+    echo "Downloading ${acc}"
+
+    datasets download genome accession "${acc}" \
+        --include genome \
+        --filename "${acc}.zip"
+
+    unzip -q "${acc}.zip" -d "${acc}"
+    find "${acc}" -name "*.fna" |
+        while read file; do
+            obiconvert -Z ${file} >genomes/$(basename ${file}).gz
+        done
+
+    rm -rf "${acc}" "${acc}.zip"
+done
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Usage: filter_one_count.sh SPECIES
+# Filters global_index_count to keep only kmers specific to SPECIES,
+# then selects the SPECIES column in-place.
+# Outputs:
+#   specific_index_count/SPECIES/index.done  (written by obikmer select)
+#   stats/specific_kmer_count/SPECIES.stats  (one CSV data row, no header)
+set -euo pipefail
+
+SPECIES="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+SOURCE="${SCRIPT_DIR}/global_index_count"
+OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
+STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
+STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIES}] filter (count) → ${OUTPUT}"
+
+LOG_FILTER=$(mktemp)
+LOG_SELECT=$(mktemp)
+trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
+
+"${BINARY}" filter \
+    --output "${OUTPUT}" \
+    --force \
+    --ingroup "species=${SPECIES}" \
+    --outgroup all \
+    --min-frac 0.5 \
+    --max-frac 1.0 \
+    --max-outgroup-count 0 \
+    "${SOURCE}" \
+    2>"${LOG_FILTER}"
+
+cat "${LOG_FILTER}" >&2
+
+"${BINARY}" select \
+    --in-place \
+    --group "${SPECIES}:species=${SPECIES}" \
+    --group-op "${SPECIES}:any" \
+    --select "${SPECIES}" \
+    "${OUTPUT}" \
+    2>"${LOG_SELECT}"
+
+cat "${LOG_SELECT}" >&2
+
+python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+def parse_reporter(logfile):
+    stats = {}
+    state = 'scan'
+    with open(logfile, errors='replace') as fh:
+        for raw in fh:
+            line = strip_ansi(raw.rstrip('\n'))
+            s    = line.strip()
+            if state == 'scan':
+                if re.search(r'\bstage\b.*\bwall\b', line):
+                    state = 'in_header'
+            elif state == 'in_header':
+                if is_sep(s): state = 'rows'
+            elif state == 'rows':
+                if is_sep(s): state = 'total'
+                elif s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 4:
+                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+            elif state == 'total':
+                if s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 3:
+                        stats['TOTAL'] = (parse_wall(parts[1]),
+                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
+                break
+    return stats
+
+f = parse_reporter(log_filter)
+s = parse_reporter(log_select)
+
+row = [species]
+for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
+    key = 'TOTAL' if stage.endswith('_total') else stage
+    w, r = d.get(key, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+print(','.join(row))
+PYEOF
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Usage: filter_one_presence.sh SPECIES
+# Filters global_index_presence to keep only kmers specific to SPECIES,
+# then selects the SPECIES column in-place.
+# Outputs:
+#   specific_index_presence/SPECIES/index.done  (written by obikmer select)
+#   stats/specific_kmer_presence/SPECIES.stats  (one CSV data row, no header)
+set -euo pipefail
+
+SPECIES="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+SOURCE="${SCRIPT_DIR}/global_index_presence"
+OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
+STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
+STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIES}] filter (presence) → ${OUTPUT}"
+
+LOG_FILTER=$(mktemp)
+LOG_SELECT=$(mktemp)
+trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
+
+"${BINARY}" filter \
+    --output "${OUTPUT}" \
+    --force \
+    --ingroup "species=${SPECIES}" \
+    --outgroup all \
+    --min-frac 0.5 \
+    --max-frac 1.0 \
+    --max-outgroup-count 0 \
+    "${SOURCE}" \
+    2>"${LOG_FILTER}"
+
+cat "${LOG_FILTER}" >&2
+
+"${BINARY}" select \
+    --in-place \
+    --group "${SPECIES}:species=${SPECIES}" \
+    --group-op "${SPECIES}:any" \
+    --select "${SPECIES}" \
+    "${OUTPUT}" \
+    2>"${LOG_SELECT}"
+
+cat "${LOG_SELECT}" >&2
+
+python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+def parse_reporter(logfile):
+    stats = {}
+    state = 'scan'
+    with open(logfile, errors='replace') as fh:
+        for raw in fh:
+            line = strip_ansi(raw.rstrip('\n'))
+            s    = line.strip()
+            if state == 'scan':
+                if re.search(r'\bstage\b.*\bwall\b', line):
+                    state = 'in_header'
+            elif state == 'in_header':
+                if is_sep(s): state = 'rows'
+            elif state == 'rows':
+                if is_sep(s): state = 'total'
+                elif s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 4:
+                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+            elif state == 'total':
+                if s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 3:
+                        stats['TOTAL'] = (parse_wall(parts[1]),
+                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
+                break
+    return stats
+
+f = parse_reporter(log_filter)
+s = parse_reporter(log_select)
+
+row = [species]
+for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
+    key = 'TOTAL' if stage.endswith('_total') else stage
+    w, r = d.get(key, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+print(','.join(row))
+PYEOF
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Usage: index_one_count.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Outputs:
+#   specimen_index_count/SPECIMEN/index.done  (written by obikmer)
+#   stats/indexing_count/SPECIMEN.stats       (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
+INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+r1="${READS_DIR}/reads_R1.fastq.gz"
+r2="${READS_DIR}/reads_R2.fastq.gz"
+if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+    echo "ERROR: reads not found in ${READS_DIR}" >&2
+    exit 1
+fi
+
+echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+
+"${BINARY}" index \
+    --output "${INDEX_PATH}" \
+    --force \
+    --theta 0 \
+    --with-counts \
+    --label "${SPECIMEN}" \
+    --meta  "species=${species}" \
+    "${r1}" "${r2}" \
+    2>"${STDERR_LOG}"
+
+cat "${STDERR_LOG}" >&2
+
+python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s): state = 'rows'
+        elif state == 'rows':
+            if is_sep(s): state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
+row = [species, strain]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Usage: index_one_presence.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Outputs:
+#   specimen_index_presence/SPECIMEN/index.done  (written by obikmer)
+#   stats/indexing_presence/SPECIMEN.stats       (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
+INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+r1="${READS_DIR}/reads_R1.fastq.gz"
+r2="${READS_DIR}/reads_R2.fastq.gz"
+if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+    echo "ERROR: reads not found in ${READS_DIR}" >&2
+    exit 1
+fi
+
+echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+
+"${BINARY}" index \
+    --output "${INDEX_PATH}" \
+    --force \
+    --theta 0 \
+    --label "${SPECIMEN}" \
+    --meta  "species=${species}" \
+    "${r1}" "${r2}" \
+    2>"${STDERR_LOG}"
+
+cat "${STDERR_LOG}" >&2
+
+python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s): state = 'rows'
+        elif state == 'rows':
+            if is_sep(s): state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
+row = [species, strain]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
+
+Like C .d files: only target: prerequisites lines, no recipes.
+Recipes stay in the Makefile as generic rules.
+"""
+import gzip
+import re
+import sys
+from pathlib import Path
+
+STOP_WORDS    = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
+                 'endosymbiont', 'of'}
+STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
+
+
+def is_stop(tok):
+    t = tok.lower()
+    return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)
+
+
+def sanitize(s):
+    return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')
+
+
+def collect_tokens(text):
+    parts = []
+    for tok in text.split():
+        tok = tok.rstrip(',.')
+        if is_stop(tok):
+            break
+        parts.append(sanitize(tok))
+    return '_'.join(filter(None, parts))
+
+
+def parse_organism(defn, gcf_id):
+    words   = defn.split()
+    species = sanitize(words[0] + '_' + words[1])
+
+    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
+    if m:
+        strain = sanitize(m.group(1))
+        if m.group(2):
+            strain += '_' + sanitize(m.group(2))
+        return species, strain
+
+    m = re.search(r'\bstrain\b\s+(.*)', defn)
+    if m:
+        strain = collect_tokens(m.group(1))
+        if strain:
+            return species, strain
+
+    remainder = re.sub(r'^\S+ \S+\s*', '', defn)
+    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
+    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
+    strain    = collect_tokens(remainder)
+    return species, strain if strain else gcf_id
+
+
+def first_definition(path):
+    with gzip.open(path, 'rt') as fh:
+        for line in fh:
+            if line.startswith('>'):
+                m = re.search(r'"definition":"([^"]*)"', line)
+                return m.group(1) if m else line[1:].split()[0]
+    return Path(path).stem
+
+
+def main():
+    entries = []   # (specimen, species, sim_dir, genome_path)
+    species_seen = []
+
+    for path in sorted(sys.argv[1:]):
+        gcf_id  = Path(path).name.replace('_genomic.fna.gz', '')
+        defn    = first_definition(path)
+        sp, st  = parse_organism(defn, gcf_id)
+        specimen = f'{sp}--{st}'
+        sim_dir  = f'simulated_data/{sp}/{st}'
+        entries.append((specimen, sp, sim_dir, path))
+        if sp not in species_seen:
+            species_seen.append(sp)
+
+    specimens = [e[0] for e in entries]
+    print('SPECIMENS :=', ' '.join(specimens))
+    print('SPECIES   :=', ' '.join(species_seen))
+
+    for specimen, species, sim_dir, genome in entries:
+        reads = f'{sim_dir}/reads_R1.fastq.gz'
+        p_done  = f'specimen_index_presence/{specimen}/index.done'
+        p_stats = f'stats/indexing_presence/{specimen}.stats'
+        c_done  = f'specimen_index_count/{specimen}/index.done'
+        c_stats = f'stats/indexing_count/{specimen}.stats'
+        ref     = f'reference_index/{specimen}.npz'
+        vp      = f'stats/verify_presence/{specimen}.stats'
+        vc      = f'stats/verify_count/{specimen}.stats'
+
+        print()
+        print(f'# {specimen}')
+        print(f'{reads}: {genome}')
+        print(f'{ref}: {reads}')
+        print(f'{p_done} {p_stats}: {reads}')
+        print(f'{c_done} {c_stats}: {reads}')
+        print(f'{vp}: {ref} {p_done}')
+        print(f'{vc}: {ref} {c_done}')
+
+    print()
+    for sp in species_seen:
+        sp_done  = f'specific_index_presence/{sp}/index.done'
+        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
+        sc_done  = f'specific_index_count/{sp}/index.done'
+        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
+        print(f'# {sp}')
+        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
+        print(f'{sc_done} {sc_stats}: global_index_count/index.done')
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
+OUTPUT="${SCRIPT_DIR}/global_index_count"
+STATS_DIR="${SCRIPT_DIR}/stats/merge_count"
+
+mkdir -p "${STATS_DIR}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
+
+parse_reporter() {
+    local run="$1" n_sources="$2" logfile="$3"
+    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
+import sys, re
+
+run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s):
+                state = 'rows'
+        elif state == 'rows':
+            if is_sep(s):
+                state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
+row = [run, n_sources]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+}
+
+mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
+
+if [[ ${#sources[@]} -eq 0 ]]; then
+    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
+    exit 1
+fi
+
+echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
+printf '  %s\n' "${sources[@]}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+
+"${BINARY}" merge \
+    --output  "${OUTPUT}" \
+    --force \
+    "${sources[@]}" \
+    2>"${STDERR_LOG}"
+
+cat "${STDERR_LOG}" >&2
+parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
+
+echo "Done. Run ${run_n} → ${CSV}"
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
+OUTPUT="${SCRIPT_DIR}/global_index_presence"
+STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"
+
+mkdir -p "${STATS_DIR}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
+
+parse_reporter() {
+    local run="$1" n_sources="$2" logfile="$3"
+    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
+import sys, re
+
+run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s):
+                state = 'rows'
+        elif state == 'rows':
+            if is_sep(s):
+                state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
+row = [run, n_sources]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+}
+
+mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
+
+if [[ ${#sources[@]} -eq 0 ]]; then
+    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
+    exit 1
+fi
+
+echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
+printf '  %s\n' "${sources[@]}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+
+"${BINARY}" merge \
+    --output          "${OUTPUT}" \
+    --force \
+    --force-presence \
+    "${sources[@]}" \
+    2>"${STDERR_LOG}"
+
+cat "${STDERR_LOG}" >&2
+parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
+
+echo "Done. Run ${run_n} → ${CSV}"
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Simulate all genomes. Delegates to simulate_one.sh per genome.
+# Prefer running via `gmake simulate` which handles individual dependencies.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+for genome_file in "${SCRIPT_DIR}"/genomes/*.fna.gz; do
+    out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
+        --dir-for "${genome_file}")
+    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
+done
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Usage: simulate_one.sh genome.fna.gz output_dir
+# Simulates paired-end HiSeq reads for a single genome.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ISS="${SCRIPT_DIR}/../.venv/bin/iss"
+COVERAGE=15
+READ_LENGTH=150
+CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
+
+genome_file="$1"
+out_dir="$2"
+
+mkdir -p "${out_dir}"
+
+tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna")
+trap 'rm -f "${tmp_fasta}"' EXIT
+
+gzip -dc "${genome_file}" > "${tmp_fasta}"
+
+genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
+n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
+
+echo "[${out_dir}]  genome=${genome_size} bp  →  ${n_reads} read pairs  (${COVERAGE}x HiSeq)"
+
+"${ISS}" generate \
+    --genomes   "${tmp_fasta}" \
+    --model     HiSeq \
+    --n_reads   "${n_reads}" \
+    --cpus      "${CPUS}" \
+    --compress \
+    --output    "${out_dir}/reads"
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+"""Compare an obikmer count index against a reference kmer set (presence + counts).
+
+Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
+streams `obikmer dump` from a --with-counts index, then reports:
+  - false negatives : kmers in reference absent from the index
+  - false positives : kmers in the index absent from the reference
+  - count mismatches: kmers present in both but with differing counts
+
+Output to stdout: one CSV row
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
+  fn_pct,fp_pct,cm_pct
+"""
+import argparse
+import subprocess
+import sys
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── dump parsing ──────────────────────────────────────────────────────────────
+
+def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
+    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32)."""
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+    kmers, counts = [], []
+    header = True
+    for line in proc.stdout:
+        if header:
+            header = False
+            continue
+        parts = line.rstrip('\n').split(',')
+        kmers.append(encode_kmer(parts[0]))
+        counts.append(int(parts[1]))
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+    order = np.argsort(np.array(kmers, dtype=np.uint64), kind='stable')
+    return (np.array(kmers, dtype=np.uint64)[order],
+            np.array(counts, dtype=np.uint32)[order])
+
+
+# ── comparison ────────────────────────────────────────────────────────────────
+
+def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
+            idx_kmers: np.ndarray, idx_counts: np.ndarray,
+            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).
+
+    All arrays sorted; cm_* cover kmers present in both arrays but with
+    differing counts.
+    """
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    # Count mismatches among shared kmers.
+    # Both arrays are sorted so we can use searchsorted.
+    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
+    pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
+    shared_mask = idx_kmers[pos_in_idx] == ref_kmers
+
+    shared_ref_counts = ref_counts[shared_mask]
+    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
+    mismatch_mask     = shared_ref_counts != shared_idx_counts
+
+    cm_kmers      = ref_kmers[shared_mask][mismatch_mask]
+    cm_ref_counts = shared_ref_counts[mismatch_mask]
+    cm_idx_counts = shared_idx_counts[mismatch_mask]
+
+    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?',
+                    help='Reference .npz file')
+    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?',
+                    help='obikmer index directory (built with --with-counts)')
+    ap.add_argument('--obikmer',  default='obikmer',
+                    help='Path to obikmer binary')
+    ap.add_argument('--species',  default='')
+    ap.add_argument('--strain',   default='')
+    ap.add_argument('--header',   action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fp',  metavar='FILE',
+                    help='Save false-positive kmer strings to FILE')
+    ap.add_argument('--save-fn',  metavar='FILE',
+                    help='Save false-negative kmer strings to FILE')
+    ap.add_argument('--save-cm',  metavar='FILE',
+                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,count_mismatch,'
+              'fn_pct,fp_pct,cm_pct')
+        return
+
+    # Detect k
+    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
+    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    # Load reference
+    print(f'Loading reference: {args.reference}', file=sys.stderr)
+    npz = np.load(args.reference)
+    ref_kmers  = npz['kmers']    # sorted uint64
+    ref_counts = npz['counts']   # uint32
+
+    # Load index
+    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
+    idx_kmers, idx_counts = load_index(args.obikmer, args.index)
+
+    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
+
+    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
+        ref_kmers, ref_counts, idx_kmers, idx_counts)
+
+    n_shared  = len(ref_kmers) - len(false_neg)
+    fn_pct    = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct    = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+    cm_pct    = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
+
+    print(f'false negatives : {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
+    print(f'false positives : {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
+    print(f'count mismatches: {len(cm_kmers):,}  ({cm_pct:.4f}% of shared)',
+          file=sys.stderr)
+
+    if args.save_fn and len(false_neg):
+        with open(args.save_fn, 'w') as fh:
+            for v in false_neg:
+                fh.write(decode_kmer(int(v), k) + '\n')
+
+    if args.save_fp and len(false_pos):
+        with open(args.save_fp, 'w') as fh:
+            for v in false_pos:
+                fh.write(decode_kmer(int(v), k) + '\n')
+
+    if args.save_cm and len(cm_kmers):
+        with open(args.save_cm, 'w') as fh:
+            fh.write('kmer,ref_count,idx_count\n')
+            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
+                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')
+
+    print(f'{args.species},{args.strain},'
+          f'{len(ref_kmers)},{len(idx_kmers)},'
+          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
+          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""Verify the merged count index against all per-specimen reference sets.
+
+Streams `obikmer dump` once on the merged index, accumulates per-specimen
+kmer+count pairs from each column, then compares each against its reference .npz.
+
+Output to stdout: one CSV row per specimen (same columns as verify_count.py)
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
+  fn_pct,fp_pct,cm_pct
+"""
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── single-pass dump ──────────────────────────────────────────────────────────
+
+def stream_merged_dump(obikmer_bin: str, index_dir: str,
+                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
+    """Stream the merged dump once.
+
+    Returns:
+        specimen_names : column labels in dump order
+        per_specimen   : mapping label → (kmer_ints, counts) for entries > 0
+    """
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+
+    header_line = proc.stdout.readline().rstrip('\n')
+    cols = header_line.split(',')
+    specimen_names = cols[1:]
+    per_specimen: dict[str, tuple[list[int], list[int]]] = {
+        name: ([], []) for name in specimen_names}
+
+    for line in proc.stdout:
+        parts = line.rstrip('\n').split(',')
+        kmer_int = encode_kmer(parts[0])
+        for i, name in enumerate(specimen_names):
+            count = int(parts[i + 1])
+            if count > 0:
+                per_specimen[name][0].append(kmer_int)
+                per_specimen[name][1].append(count)
+
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+
+    return specimen_names, per_specimen
+
+
+# ── per-specimen comparison ───────────────────────────────────────────────────
+
+def compare_specimen(name: str,
+                     kmer_list: list[int],
+                     count_list: list[int],
+                     ref_dir: Path,
+                     k: int,
+                     save_fn: Path | None,
+                     save_fp: Path | None,
+                     save_cm: Path | None,
+                     ) -> str:
+    ref_path = ref_dir / f'{name}.npz'
+    if not ref_path.exists():
+        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
+        return ''
+
+    species = name.split('--')[0]
+    strain  = name[len(species) + 2:]
+
+    npz        = np.load(ref_path)
+    ref_kmers  = npz['kmers']    # sorted uint64
+    ref_counts = npz['counts']   # uint32
+
+    order      = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
+    idx_kmers  = np.array(kmer_list,  dtype=np.uint64)[order]
+    idx_counts = np.array(count_list, dtype=np.uint32)[order]
+
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    # Count mismatches among shared kmers
+    pos_in_idx     = np.searchsorted(idx_kmers, ref_kmers)
+    pos_in_idx     = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
+    shared_mask    = idx_kmers[pos_in_idx] == ref_kmers
+    mismatch_mask  = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
+    cm_kmers       = ref_kmers[shared_mask][mismatch_mask]
+    cm_ref         = ref_counts[shared_mask][mismatch_mask]
+    cm_idx         = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
+
+    n_shared = int(shared_mask.sum())
+    fn_pct   = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct   = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+    cm_pct   = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
+
+    print(f'  {name}: ref={len(ref_kmers):,}  idx={len(idx_kmers):,}  '
+          f'fn={len(false_neg):,} ({fn_pct:.4f}%)  '
+          f'fp={len(false_pos):,} ({fp_pct:.4f}%)  '
+          f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
+          file=sys.stderr)
+
+    if save_fn and len(false_neg):
+        fn_file = save_fn / f'{name}_fn.txt'
+        fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
+
+    if save_fp and len(false_pos):
+        fp_file = save_fp / f'{name}_fp.txt'
+        fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
+
+    if save_cm and len(cm_kmers):
+        cm_file = save_cm / f'{name}_cm.csv'
+        lines = ['kmer,ref_count,idx_count']
+        for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
+            lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
+        cm_file.write_text('\n'.join(lines) + '\n')
+
+    return (f'{species},{strain},'
+            f'{len(ref_kmers)},{len(idx_kmers)},'
+            f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
+            f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
+                    help='Merged count index directory')
+    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
+                    help='Directory containing per-specimen .npz reference files')
+    ap.add_argument('--obikmer', default='obikmer')
+    ap.add_argument('--header',  action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fn', metavar='DIR',
+                    help='Directory for false-negative kmer lists')
+    ap.add_argument('--save-fp', metavar='DIR',
+                    help='Directory for false-positive kmer lists')
+    ap.add_argument('--save-cm', metavar='DIR',
+                    help='Directory for count-mismatch CSV files')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,count_mismatch,'
+              'fn_pct,fp_pct,cm_pct')
+        return
+
+    ref_dir = Path(args.ref_dir)
+    save_fn = Path(args.save_fn) if args.save_fn else None
+    save_fp = Path(args.save_fp) if args.save_fp else None
+    save_cm = Path(args.save_cm) if args.save_cm else None
+    for d in (save_fn, save_fp, save_cm):
+        if d: d.mkdir(parents=True, exist_ok=True)
+
+    out1 = subprocess.check_output(
+        [args.obikmer, 'dump', '--head', '1', args.index],
+        stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
+    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
+    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
+
+    for name in specimen_names:
+        kmers, counts = per_specimen[name]
+        row = compare_specimen(name, kmers, counts, ref_dir, k,
+                               save_fn, save_fp, save_cm)
+        if row:
+            print(row)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+INDEX="${SCRIPT_DIR}/global_index_count"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"
+
+mkdir -p "${STATS_DIR}"
+
+CURRENT="${STATS_DIR}/current.csv"
+
+"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    "${INDEX}" "${REF_DIR}" \
+    >>"${CURRENT}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')")
+ARCHIVE="${STATS_DIR}/count_${run_n}.csv"
+cp "${CURRENT}" "${ARCHIVE}"
+
+echo "Done. Results → ${ARCHIVE}"
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""Verify the merged presence index against all per-specimen reference sets.
+
+Streams `obikmer dump` once on the merged index, accumulates per-specimen
+kmer sets from each column, then compares each against its reference .npz.
+
+Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
+"""
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── single-pass dump ──────────────────────────────────────────────────────────
+
+def stream_merged_dump(obikmer_bin: str, index_dir: str,
+                       ) -> tuple[list[str], dict[str, list[int]]]:
+    """Stream the merged dump once.
+
+    Returns:
+        specimen_names : column labels in dump order (excluding 'kmer')
+        per_specimen   : mapping label → list of kmer ints where presence > 0
+    """
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+
+    header_line = proc.stdout.readline().rstrip('\n')
+    cols = header_line.split(',')
+    specimen_names = cols[1:]           # first col is 'kmer'
+    per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
+
+    for line in proc.stdout:
+        parts = line.rstrip('\n').split(',')
+        kmer_int = encode_kmer(parts[0])
+        for i, name in enumerate(specimen_names):
+            if int(parts[i + 1]) > 0:
+                per_specimen[name].append(kmer_int)
+
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+
+    return specimen_names, per_specimen
+
+
+# ── per-specimen comparison ───────────────────────────────────────────────────
+
+def compare_specimen(name: str,
+                     kmer_list: list[int],
+                     ref_dir: Path,
+                     k: int,
+                     save_fn: Path | None,
+                     save_fp: Path | None,
+                     ) -> str:
+    """Compare one specimen column against its reference .npz.
+
+    Returns a CSV row string.
+    """
+    ref_path = ref_dir / f'{name}.npz'
+    if not ref_path.exists():
+        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
+        return ''
+
+    species = name.split('--')[0]
+    strain  = name[len(species) + 2:]
+
+    ref_kmers = np.load(ref_path)['kmers']          # sorted uint64
+    idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
+
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+
+    print(f'  {name}: ref={len(ref_kmers):,}  idx={len(idx_kmers):,}  '
+          f'fn={len(false_neg):,} ({fn_pct:.4f}%)  '
+          f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
+          file=sys.stderr)
+
+    if save_fn and len(false_neg):
+        fn_file = save_fn / f'{name}_fn.txt'
+        fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
+
+    if save_fp and len(false_pos):
+        fp_file = save_fp / f'{name}_fp.txt'
+        fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
+
+    return (f'{species},{strain},'
+            f'{len(ref_kmers)},{len(idx_kmers)},'
+            f'{len(false_neg)},{len(false_pos)},'
+            f'{fn_pct:.4f},{fp_pct:.4f}')
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
+                    help='Merged presence index directory')
+    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
+                    help='Directory containing per-specimen .npz reference files')
+    ap.add_argument('--obikmer', default='obikmer')
+    ap.add_argument('--header',  action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fn', metavar='DIR',
+                    help='Directory to save false-negative kmer lists')
+    ap.add_argument('--save-fp', metavar='DIR',
+                    help='Directory to save false-positive kmer lists')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,fn_pct,fp_pct')
+        return
+
+    ref_dir  = Path(args.ref_dir)
+    save_fn  = Path(args.save_fn) if args.save_fn else None
+    save_fp  = Path(args.save_fp) if args.save_fp else None
+    if save_fn: save_fn.mkdir(parents=True, exist_ok=True)
+    if save_fp: save_fp.mkdir(parents=True, exist_ok=True)
+
+    # Detect k
+    out1 = subprocess.check_output(
+        [args.obikmer, 'dump', '--head', '1', args.index],
+        stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
+    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
+    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
+
+    for name in specimen_names:
+        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
+        if row:
+            print(row)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+INDEX="${SCRIPT_DIR}/global_index_presence"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"
+
+mkdir -p "${STATS_DIR}"
+
+CURRENT="${STATS_DIR}/current.csv"
+
+"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    "${INDEX}" "${REF_DIR}" \
+    >>"${CURRENT}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')")
+ARCHIVE="${STATS_DIR}/presence_${run_n}.csv"
+cp "${CURRENT}" "${ARCHIVE}"
+
+echo "Done. Results → ${ARCHIVE}"
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Usage: verify_one_count.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_count.py"
+
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
+INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIMEN}] verifying count"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    --species "${species}" \
+    --strain  "${strain}" \
+    "${REF_NPZ}" "${INDEX_DIR}" \
+    >"${STATS_FILE}"
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Usage: verify_one_presence.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"
+
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
+INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIMEN}] verifying presence"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    --species "${species}" \
+    --strain  "${strain}" \
+    "${REF_NPZ}" "${INDEX_DIR}" \
+    >"${STATS_FILE}"
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""Compare an obikmer index against a reference kmer set (presence/absence).
+
+Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
+streams the output of `obikmer dump`, encodes each kmer string to uint64,
+then reports false negatives and false positives using numpy set operations.
+
+Output to stdout: one CSV row
+  species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
+"""
+import argparse
+import subprocess
+import sys
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── dump parsing ──────────────────────────────────────────────────────────────
+
+def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
+    """Stream `obikmer dump` and return a sorted uint64 array of kmer integers."""
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+    kmers = []
+    header = True
+    for line in proc.stdout:
+        if header:
+            header = False
+            continue
+        kmer_str = line.split(',', 1)[0]
+        kmers.append(encode_kmer(kmer_str))
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+    arr = np.array(kmers, dtype=np.uint64)
+    arr.sort()
+    return arr
+
+
+# ── comparison ────────────────────────────────────────────────────────────────
+
+def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """Return (false_negatives, false_positives) as uint64 arrays."""
+    false_neg = np.setdiff1d(ref, idx, assume_unique=True)
+    false_pos = np.setdiff1d(idx, ref, assume_unique=True)
+    return false_neg, false_pos
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?', help='Reference .npz file')
+    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
+    ap.add_argument('--obikmer',  default='obikmer',   help='Path to obikmer binary')
+    ap.add_argument('--species',  default='',          help='Species label for CSV row')
+    ap.add_argument('--strain',   default='',          help='Strain label for CSV row')
+    ap.add_argument('--header',   action='store_true', help='Print CSV header and exit')
+    ap.add_argument('--save-fp',  metavar='FILE',
+                    help='Save false-positive kmer strings to FILE')
+    ap.add_argument('--save-fn',  metavar='FILE',
+                    help='Save false-negative kmer strings to FILE')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,fn_pct,fp_pct')
+        return
+
+    # Detect k from the index (one cheap call before the full dump).
+    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
+    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    # Load reference
+    print(f'Loading reference: {args.reference}', file=sys.stderr)
+    npz = np.load(args.reference)
+    ref_kmers = npz['kmers']          # already sorted uint64
+
+    # Load index
+    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
+    idx_kmers = load_index_kmers(args.obikmer, args.index)
+
+    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
+
+    false_neg, false_pos = compare(ref_kmers, idx_kmers)
+
+    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+
+    print(f'false negatives: {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
+    print(f'false positives: {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
+
+    if args.save_fn and len(false_neg):
+        with open(args.save_fn, 'w') as fh:
+            for v in false_neg:
+                fh.write(decode_kmer(int(v), k) + '\n')
+        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)
+
+    if args.save_fp and len(false_pos):
+        with open(args.save_fp, 'w') as fh:
+            for v in false_pos:
+                fh.write(decode_kmer(int(v), k) + '\n')
+        print(f'False positives saved → {args.save_fp}', file=sys.stderr)
+
+    print(f'{args.species},{args.strain},'
+          f'{len(ref_kmers)},{len(idx_kmers)},'
+          f'{len(false_neg)},{len(false_pos)},'
+          f'{fn_pct:.4f},{fp_pct:.4f}')
+
+
+if __name__ == '__main__':
+    main()
@@ -29,16 +29,17 @@ Multiple values separated by `|` are always OR-ed within the predicate.

 ### Path matching (`~` and `!~`)

-Metadata values can represent hierarchical taxonomic paths such as
+Metadata values can represent hierarchical concept paths such as
 `/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.

- **Absolute pattern** (starts with `/`): the value must start with the pattern
-  at a segment boundary.
-  `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
+**Both the stored metadata value and the pattern must start with `/`.**
+A pattern that does not start with `/` is rejected at parse time with an error.
+
+The value matches the pattern if it equals it exactly or starts with the pattern
+followed by `/` (segment-boundary prefix):
+
+- `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
  `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
- **Bare segment** (no leading `/`): the value must contain the pattern as an
-  exact path component anywhere.
-  `taxon~Betula` matches any path that has `Betula` as one of its segments.

 ### Missing metadata key → NA

@@ -53,6 +53,7 @@ nav:
      - Merge parallelism & memory: implementation/merge_parallelism.md
      - Kmer filtering: implementation/filtering.md
      - Select command: implementation/select.md
+      - obitaxonomy crate: implementation/obitaxonomy.md
  - Architecture:
      - Sequences: architecture/sequences/invariant.md
      - Kmer index: architecture/index_architecture.md
@@ -1853,6 +1853,10 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "obitaxonomy"
+version = "0.1.0"
+
 [[package]]
 name = "object"
 version = "0.37.3"
@@ -1,5 +1,5 @@
 [workspace]
 resolver = "3"
-members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
+members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
 [profile.release]
 debug = 1
@@ -88,9 +88,9 @@ impl<'a> IntoIterator for &'a PersistentBitVec {
 // ── BitIter ───────────────────────────────────────────────────────────────────

 pub struct BitIter<'a> {
-    pub(crate) words: &'a [u64],
-    pub(crate) slot:  usize,
-    pub(crate) n:     usize,
+    words: &'a [u64],
+    slot:  usize,
+    n:     usize,
 }

 impl ExactSizeIterator for BitIter<'_> {}
@@ -132,7 +132,7 @@ impl PersistentBitVecBuilder {
        Ok(Self { mmap, n, path: path.to_path_buf() })
    }

-    pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
+    pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
        let file_size = HEADER_SIZE + n_bytes_for_words(n);
        let file = OpenOptions::new()
            .read(true).write(true).create(true).truncate(true)
@@ -18,11 +18,11 @@ pub use builder::PersistentCompactIntVecBuilder;
 pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
-pub use reader::PersistentCompactIntVec;
-pub use tempbitvec::TempBitVec;
-pub use tempintvec::TempCompactIntVec;
+pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
+pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
+pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
 pub use traits::{BitPartials, ColumnWeights, CountPartials};
-pub use views::{BitSliceView, IntSliceView};
+pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};

 #[cfg(test)]
 #[path = "tests/mod.rs"]
@@ -43,27 +43,27 @@ impl TempBitVec {

 // ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────

-pub(crate) struct TempBitVecBuilder {
+pub struct TempBitVecBuilder {
    builder: PersistentBitVecBuilder,
    temp: TempDir,
 }

 impl TempBitVecBuilder {
-    pub(crate) fn new(n: usize) -> io::Result<Self> {
+    pub fn new(n: usize) -> io::Result<Self> {
        let temp = TempDir::new()?;
        let path = temp.path().join("data.pbiv");
        let builder = PersistentBitVecBuilder::new(n, &path)?;
        Ok(Self { builder, temp })
    }

-    pub(crate) fn new_ones(n: usize) -> io::Result<Self> {
+    pub fn new_ones(n: usize) -> io::Result<Self> {
        let temp = TempDir::new()?;
        let path = temp.path().join("data.pbiv");
        let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
        Ok(Self { builder, temp })
    }

-    pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
+    pub fn freeze(self) -> io::Result<TempBitVec> {
        let Self { builder, temp } = self;
        let vec = builder.finish()?;
        Ok(TempBitVec { vec, _temp: temp })
@@ -72,7 +72,8 @@ impl TempBitVecBuilder {
    pub fn set(&mut self, slot: usize, value: bool) {
        self.builder.set(slot, value);
    }
-    pub(crate) fn view(&self) -> BitSliceView<'_> {
+
+    pub fn view(&self) -> BitSliceView<'_> {
        self.builder.view()
    }

@@ -80,19 +81,19 @@ impl TempBitVecBuilder {
        self.builder.or(other);
    }

-    pub(crate) fn and(&mut self, other: BitSliceView<'_>) {
+    pub fn and(&mut self, other: BitSliceView<'_>) {
        self.builder.and(other);
    }

-    pub(crate) fn xor(&mut self, other: BitSliceView<'_>) {
+    pub fn xor(&mut self, other: BitSliceView<'_>) {
        self.builder.xor(other);
    }

-    pub(crate) fn not(&mut self) {
+    pub fn not(&mut self) {
        self.builder.not();
    }

-    pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) {
+    pub fn copy_from(&mut self, src: BitSliceView<'_>) {
        self.builder.copy_from(src);
    }

@@ -100,11 +101,11 @@ impl TempBitVecBuilder {
        self.builder.or_where(col, pred);
    }

-    pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.and_where(col, pred);
    }

-    pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.xor_where(col, pred);
    }
 }
@@ -32,60 +32,58 @@ impl TempCompactIntVec {

 // ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──

-pub(crate) struct TempCompactIntVecBuilder {
+pub struct TempCompactIntVecBuilder {
    builder: PersistentCompactIntVecBuilder,
    temp:    TempDir,
 }

 impl TempCompactIntVecBuilder {
-    pub(crate) fn new(n: usize) -> io::Result<Self> {
+    pub fn new(n: usize) -> io::Result<Self> {
        let temp = TempDir::new()?;
        let path = temp.path().join("data.pciv");
        let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
        Ok(Self { builder, temp })
    }

-    pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
+    pub fn freeze(self) -> io::Result<TempCompactIntVec> {
        let Self { builder, temp } = self;
        let vec = builder.finish()?;
        Ok(TempCompactIntVec { vec, _temp: temp })
    }

-    // ── Delegation methods ────────────────────────────────────────────────────
+    pub fn n(&self) -> usize { self.builder.len() }

-    pub(crate) fn n(&self) -> usize { self.builder.len() }
+    pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
+    pub fn get(&self, slot: usize) -> u32           { self.builder.get(slot) }

-    pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
-    pub(crate) fn get(&self, slot: usize) -> u32           { self.builder.get(slot) }
+    pub fn primary_bytes(&self)         -> &[u8]      { self.builder.primary_bytes() }
+    pub fn primary_bytes_mut(&mut self) -> &mut [u8]  { self.builder.primary_bytes_mut() }

-    pub(crate) fn primary_bytes(&self)       -> &[u8]      { self.builder.primary_bytes() }
-    pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
-
-    pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) {
+    pub fn inc_present(&mut self, col: BitSliceView<'_>) {
        self.builder.inc_present(col);
    }

-    pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
+    pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
        self.builder.inc_present_fast(col);
    }

-    pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.inc_predicate(col, pred);
    }

-    pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.inc_predicate_fast(col, pred);
    }

-    pub(crate) fn add(&mut self, other: IntSliceView<'_>) {
+    pub fn add(&mut self, other: IntSliceView<'_>) {
        self.builder.add(other);
    }

-    pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) {
+    pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
        self.builder.mask_with(mask);
    }

-    pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
-    pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
-    pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
+    pub fn min(&mut self, other: IntSliceView<'_>)  { self.builder.min(other); }
+    pub fn max(&mut self, other: IntSliceView<'_>)  { self.builder.max(other); }
+    pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
 }
@@ -3,6 +3,7 @@ use crossbeam_channel;
 use hashbrown::HashMap;
 use obikseq::k;
 use obikseq::{CanonicalKmer, Sequence, Unitig};
+#[cfg(not(any(test, feature = "test-utils")))]
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use std::cell::RefCell;
 use std::fmt;
@@ -11,7 +11,7 @@ use obilayeredmap::IndexMode;
 use crate::error::{OKIError, OKIResult};
 use crate::index::KmerIndex;
 use crate::meta::{GenomeInfo, IndexMeta};
-use crate::state::IndexState;
+use crate::state::{IndexState, SENTINEL_INDEXED};

 pub use obikpartitionner::MergeMode;

@@ -263,6 +263,8 @@ impl KmerIndex {
            rep.push(t.stop());
        }

+        fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
+
        KmerIndex::open(output)
    }
 }
@@ -49,6 +49,11 @@ impl MetaPred {
        if values.iter().any(|v| v.is_empty()) {
            return Err(format!("empty value in predicate: {s}"));
        }
+        if matches!(op, PredOp::Matches | PredOp::NotMatches) {
+            if let Some(v) = values.iter().find(|v| !v.starts_with('/')) {
+                return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}"));
+            }
+        }

        Ok(Self { key, op, values })
    }
@@ -72,16 +77,12 @@ impl MetaPred {

 /// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
 ///
-/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
-/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
+/// Both `value` and `pattern` must start with `/`.
+/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`.
 fn path_matches(value: &str, pattern: &str) -> bool {
-    if pattern.starts_with('/') {
-        value == pattern
-            || (value.starts_with(pattern)
-                && value[pattern.len()..].starts_with('/'))
-    } else {
-        value.split('/').any(|seg| seg == pattern)
-    }
+    value == pattern
+        || (value.starts_with(pattern)
+            && value[pattern.len()..].starts_with('/'))
 }

 // ── Three-value group evaluation ──────────────────────────────────────────────
@@ -1,28 +0,0 @@
->F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"}
-GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA
->C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"}
-AAAAAAATTAAGTCCCATATTGAAACATTAT
->9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"}
-AAAAAATTAAGTCCCATATTGAAACATTATC
->74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"}
-AAAAATTAAGTCCCATATTGAAACATTATCA
->45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"}
-AAAATTAAGTCCCATATTGAAACATTATCAC
->5F44864B90170AF4 {"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"}
-AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT
->8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"}
-AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG
->C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"}
-AATGTGAGTTGTTAATATTACATAATTGGGTT
->933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"}
-TGTGAGTTGTTAATATTACATAATTGGGTTT
->3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"}
-GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT
->1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"}
-TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC
->8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"}
-AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT
->247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"}
-CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT
->11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"}
-GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG