2026-06-22 08:47:24 +00:00
42 changed files with 2585 additions and 84 deletions
@@ -9,3 +9,13 @@ data-stress
 ./**/*.json
 *.bin
 Betula_exilis--IGA-24-33
 benchmark/genomes
 benchmark/simulated_data
 benchmark/specimen_index_presence
 benchmark/specimen_index_count
 benchmark/global_index_presence
 benchmark/global_index_count
 benchmark/stats
 benchmark/reference_index
 benchmark/specific_index_count
 benchmark/specific_index_presence
@@ -0,0 +1,2 @@
 /cache
 /project.local.yml
@@ -0,0 +1,133 @@
 # the name by which the project can be referenced within Serena
 project_name: "obikmer"
 # list of languages for which language servers are started; choose from:
 #   al                  angular             ansible             bash                clojure
 #   cpp                 cpp_ccls            crystal             csharp              csharp_omnisharp
 #   dart                elixir              elm                 erlang              fortran
 #   fsharp              go                  groovy              haskell             haxe
 #   hlsl                html                java                json                julia
 #   kotlin              lean4               lua                 luau                markdown
 #   matlab              msl                 nix                 ocaml               pascal
 #   perl                php                 php_phpactor        powershell          python
 #   python_jedi         python_ty           r                   rego                ruby
 #   ruby_solargraph     rust                scala               scss                solidity
 #   svelte              swift               systemverilog       terraform           toml
 #   typescript          typescript_vts      vue                 yaml                zig
 #   (This list may be outdated. For the current list, see values of Language enum here:
 #   https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
 #   For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
 # Note:
 #   - For C, use cpp
 #   - For JavaScript, use typescript
 #   - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
 #   - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
 #   - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
 #   - For Free Pascal/Lazarus, use pascal
 # Special requirements:
 #   Some languages require additional setup/installations.
 #   See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
 # When using multiple languages, the first language server that supports a given file will be used for that file.
 # The first language is the default language and the respective language server will be used as a fallback.
 # Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
 languages:
 - rust
 # the encoding used by text files in the project
 # For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
 encoding: "utf-8"
 # line ending convention to use when writing source files.
 # Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
 # This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
 line_ending:
 # The language backend to use for this project.
 # If not set, the global setting from serena_config.yml is used.
 # Valid values: LSP, JetBrains
 # Note: the backend is fixed at startup. If a project with a different backend
 # is activated post-init, an error will be returned.
 language_backend:
 # whether to use project's .gitignore files to ignore files
 ignore_all_files_in_gitignore: true
 # advanced configuration option allowing to configure language server-specific options.
 # Maps the language key to the options.
 # Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
 # No documentation on options means no options are available.
 ls_specific_settings: {}
 # list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
 # Paths can be absolute or relative to the project root.
 # Each folder is registered as an LSP workspace folder, enabling language servers to discover
 # symbols and references across package boundaries.
 # Currently supported for: TypeScript.
 # Example:
 #   additional_workspace_folders:
 #     - ../sibling-package
 #     - ../shared-lib
 additional_workspace_folders: []
 # list of additional paths to ignore in this project.
 # Same syntax as gitignore, so you can use * and **.
 # Note: global ignored_paths from serena_config.yml are also applied additively.
 ignored_paths: []
 # whether the project is in read-only mode
 # If set to true, all editing tools will be disabled and attempts to use them will result in an error
 # Added on 2025-04-18
 read_only: false
 # list of tool names to exclude.
 # This extends the existing exclusions (e.g. from the global configuration)
 # Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
 excluded_tools: []
 # list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
 # This extends the existing inclusions (e.g. from the global configuration).
 # Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
 included_optional_tools: []
 # fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
 # This cannot be combined with non-empty excluded_tools or included_optional_tools.
 # Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
 fixed_tools: []
 # list of mode names that are to be activated by default, overriding the setting in the global configuration.
 # The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
 # If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
 # Otherwise, this overrides the setting from the global configuration (serena_config.yml).
 # Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
 # for this project.
 # This setting can, in turn, be overridden by CLI parameters (--mode).
 # See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
 default_modes:
 # list of mode names to be activated additionally for this project, e.g. ["query-projects"]
 # The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
 # See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
 added_modes:
 # initial prompt for the project. It will always be given to the LLM upon activating the project
 # (contrary to the memories, which are loaded on demand).
 initial_prompt: ""
 # time budget (seconds) per tool call for the retrieval of additional symbol information
 # such as docstrings or parameter information.
 # This overrides the corresponding setting in the global configuration; see the documentation there.
 # If null or missing, use the setting from the global configuration.
 symbol_info_budget:
 # list of regex patterns which, when matched, mark a memory entry as read-only.
 # Extends the list from the global configuration, merging the two lists.
 read_only_memory_patterns: []
 # list of regex patterns for memories to completely ignore.
 # Matching memories will not appear in list_memories or activate_project output
 # and cannot be accessed via read_memory or write_memory.
 # To access ignored memory files, use the read_file tool on the raw file path.
 # Extends the list from the global configuration, merging the two lists.
 # Example: ["_archive/.*", "_episodes/.*"]
 ignored_memory_patterns: []
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
 ---
 Je continue à poser mes questions et à guider la discussion.
 ---
 ## MCP Tools
 **Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
 ### Hiérarchie des outils pour ce projet Rust
 **Navigation et édition de code → serena en priorité**
 - Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
 - Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
 - Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
 - Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
 - Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
 - Ne pas utiliser `cclsp` quand serena couvre le besoin
 **Analyse architecturale → jcodemunch**
 - Hotspots, couplage, dead code, dépendances entre modules
 - Utiliser avant de refactorer une zone critique
 **Raisonnement complexe → sequential-thinking**
 - Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
 **Documentation de crates → context7**
 - Toujours consulter avant d'utiliser une API de bibliothèque externe
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
 		mkdocs mkdocs-material \
 		mkdocs-mermaid2-plugin \
 		mkdocs-bibtex
 	$(PIP) install --quiet --upgrade InSilicoSeq
 # ── obikmer binary ───────────────────────────────────────────────────────────
@@ -0,0 +1,144 @@
 # Benchmark pipeline driver.
 # Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
 BINARY  := ../src/target/release/obikmer
 VENV_PY := ../.venv/bin/python3
 GENOMES := $(wildcard genomes/*.fna.gz)
 # Delete a target whose recipe failed.  deps.mk is written through a shell
 # redirection, so without this a failing make_deps.py would leave a truncated
 # but fresh-looking deps.mk behind that is then silently included.
 .DELETE_ON_ERROR:
 # SPECIMENS, SPECIES, and the full dependency graph are generated by
 # make_deps.py from the genome FASTA headers — like .d files in C.
 # Make rebuilds deps.mk whenever a genome file or make_deps.py changes and
 # restarts.
 -include deps.mk
 # Per-specimen outputs (one entry per <species>--<strain> in SPECIMENS).
 REF_NPZS              := $(SPECIMENS:%=reference_index/%.npz)
 PRESENCE_DONE         := $(SPECIMENS:%=specimen_index_presence/%/index.done)
 PRESENCE_STATS        := $(SPECIMENS:%=stats/indexing_presence/%.stats)
 COUNT_DONE            := $(SPECIMENS:%=specimen_index_count/%/index.done)
 COUNT_STATS           := $(SPECIMENS:%=stats/indexing_count/%.stats)
 VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
 VERIFY_COUNT_STATS    := $(SPECIMENS:%=stats/verify_count/%.stats)
 # Per-species outputs (one entry per name in SPECIES).
 SPECIFIC_PRESENCE_DONE  := $(SPECIES:%=specific_index_presence/%/index.done)
 SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
 SPECIFIC_COUNT_DONE     := $(SPECIES:%=specific_index_count/%/index.done)
 SPECIFIC_COUNT_STATS    := $(SPECIES:%=stats/specific_kmer_count/%.stats)
 # <species>--<strain> maps to the directory simulated_data/<species>/<strain>/.
 SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
 # Recipes always run one at a time, even under -j
 # (presumably to keep the wall-time/RSS measurements undisturbed — confirm).
 .NOTPARALLEL:
 .PHONY: all simulate reference \
        index_presence index_count \
        aggregate_index_presence aggregate_index_count \
        merge_presence merge_count \
        verify_presence verify_count \
        aggregate_verify_presence aggregate_verify_count \
        verify_merge_presence verify_merge_count \
        filter_presence filter_count \
        aggregate_filter_presence aggregate_filter_count
 verify_merge_presence: stats/verify_merge_presence/current.csv
 verify_merge_count:    stats/verify_merge_count/current.csv
 all: aggregate_verify_presence aggregate_verify_count \
     verify_merge_presence verify_merge_count \
     aggregate_filter_presence aggregate_filter_count
 # ── dependency file ───────────────────────────────────────────────────────────
 # The generator script is a prerequisite too, so editing it refreshes deps.mk;
 # only the genome files are passed to it as arguments.
 deps.mk: $(GENOMES) make_deps.py
 	$(VENV_PY) make_deps.py $(GENOMES) > $@
 # ── simulation ────────────────────────────────────────────────────────────────
 # Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
 $(SIMULATED_READS):
 	bash simulate_one.sh $< $(dir $@)
 simulate: $(SIMULATED_READS)
 # ── reference kmer sets ───────────────────────────────────────────────────────
 # Prerequisites (reads → npz) are in deps.mk.
 reference_index/%.npz:
 	bash build_reference.sh $*
 reference: $(REF_NPZS)
 # ── per-specimen indexing ─────────────────────────────────────────────────────
 # Prerequisites (reads → index.done + .stats) are in deps.mk.
 # Grouped targets: one script run produces both the index and its stats file.
 specimen_index_presence/%/index.done \
 stats/indexing_presence/%.stats &: $(BINARY)
 	bash index_one_presence.sh $*
 specimen_index_count/%/index.done \
 stats/indexing_count/%.stats &: $(BINARY)
 	bash index_one_count.sh $*
 index_presence: $(PRESENCE_DONE)
 index_count:    $(COUNT_DONE)
 # ── indexing stats aggregation ────────────────────────────────────────────────
 aggregate_index_presence: $(PRESENCE_STATS)
 	bash aggregate_stats.sh indexing_presence
 aggregate_index_count: $(COUNT_STATS)
 	bash aggregate_stats.sh indexing_count
 # ── global merge ──────────────────────────────────────────────────────────────
 global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
 	bash merge_presence.sh
 global_index_count/index.done: $(COUNT_DONE) $(BINARY)
 	bash merge_count.sh
 merge_presence: global_index_presence/index.done
 merge_count:    global_index_count/index.done
 # ── per-specimen verification ─────────────────────────────────────────────────
 # Prerequisites (index.done + npz → .stats) are in deps.mk.
 stats/verify_presence/%.stats:
 	bash verify_one_presence.sh $*
 stats/verify_count/%.stats:
 	bash verify_one_count.sh $*
 verify_presence: $(VERIFY_PRESENCE_STATS)
 verify_count:    $(VERIFY_COUNT_STATS)
 # ── verification stats aggregation ───────────────────────────────────────────
 aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
 	bash aggregate_stats.sh verify_presence
 aggregate_verify_count: $(VERIFY_COUNT_STATS)
 	bash aggregate_stats.sh verify_count
 # ── species-specific indexes ──────────────────────────────────────────────────
 # Prerequisites (global index → specific index) are in deps.mk.
 specific_index_presence/%/index.done \
 stats/specific_kmer_presence/%.stats &: $(BINARY)
 	bash filter_one_presence.sh $*
 specific_index_count/%/index.done \
 stats/specific_kmer_count/%.stats &: $(BINARY)
 	bash filter_one_count.sh $*
 filter_presence: $(SPECIFIC_PRESENCE_DONE)
 filter_count:    $(SPECIFIC_COUNT_DONE)
 aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
 	bash aggregate_stats.sh specific_kmer_presence
 aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
 	bash aggregate_stats.sh specific_kmer_count
 # ── merged index verification ─────────────────────────────────────────────────
 stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
 	bash verify_merge_presence.sh
 stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
 	bash verify_merge_count.sh
@@ -0,0 +1,132 @@
 # Benchmark pipeline
 Requires **GNU Make ≥ 4.3** (grouped targets `&:`).  On macOS use `gmake`.
 ```
 gmake all          # full pipeline
 gmake simulate     # simulation only
 gmake reference    # reference kmer sets only
 ```
 ## Pipeline overview
 ```mermaid
 flowchart TD
    GENOMES["genomes/*.fna.gz"]
    BIN["obikmer binary"]
    GENOMES --> simulate
    simulate --> simdata[("simulated_data/")]
    simdata --> reference
    reference --> refnpz[("reference_index/*.npz")]
    subgraph presence ["Presence track"]
        simdata  --> index_presence
        BIN      --> index_presence
        index_presence --> pres_done[("specimen_index_presence/")]
        index_presence --> pres_istats[("stats/indexing_presence/")]
        pres_istats --> aggregate_index_presence
        pres_done --> merge_presence
        BIN       --> merge_presence
        merge_presence --> gpres[("global_index_presence/")]
        refnpz    --> verify_presence
        pres_done --> verify_presence
        verify_presence --> vpres_stats[("stats/verify_presence/")]
        vpres_stats --> aggregate_verify_presence
        gpres --> filter_presence
        BIN   --> filter_presence
        filter_presence --> spec_pres[("specific_index_presence/")]
        filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
        spec_pres_stats --> aggregate_filter_presence
        refnpz --> verify_merge_presence
        gpres  --> verify_merge_presence
        verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
    end
    subgraph count ["Count track"]
        simdata --> index_count
        BIN     --> index_count
        index_count --> count_done[("specimen_index_count/")]
        index_count --> count_istats[("stats/indexing_count/")]
        count_istats --> aggregate_index_count
        count_done --> merge_count
        BIN        --> merge_count
        merge_count --> gcount[("global_index_count/")]
        refnpz     --> verify_count
        count_done --> verify_count
        verify_count --> vcount_stats[("stats/verify_count/")]
        vcount_stats --> aggregate_verify_count
        gcount --> filter_count
        BIN    --> filter_count
        filter_count --> spec_count[("specific_index_count/")]
        filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
        spec_count_stats --> aggregate_filter_count
        refnpz --> verify_merge_count
        gcount --> verify_merge_count
        verify_merge_count --> vmc[("stats/verify_merge_count/")]
    end
    aggregate_verify_presence  --> all
    aggregate_verify_count     --> all
    vmp                        --> all
    vmc                        --> all
    all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
    all -. "$(MAKE) re-eval" .-> aggregate_filter_count
 ```
 ## Steps
 | Target | Script | Description |
 |---|---|---|
 | `simulate` | `simulate_one.sh` | Simulate sequencing reads from the reference genomes |
 | `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
 | `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
 | `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
 | `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
 | `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
 | `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
 | `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
 | `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
 | `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
 | `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
 | `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
 | `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
 | `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
 | `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
 | `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
 | `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
 | `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
 ## Directory layout
 ```
 benchmark/
 ├── genomes/                        # input reference genomes (.fna.gz)
 ├── simulated_data/                 # generated by simulate
 │   └── <species>/<specimen>/
 ├── reference_index/                # reference kmer sets (.npz)
 ├── specimen_index_presence/        # per-specimen presence indexes
 ├── specimen_index_count/           # per-specimen count indexes
 ├── global_index_presence/          # merged global presence index
 ├── global_index_count/             # merged global count index
 ├── specific_index_presence/        # species-specific presence indexes
 ├── specific_index_count/           # species-specific count indexes
 └── stats/                          # all benchmark statistics
    ├── indexing_presence/
    ├── indexing_count/
    ├── verify_presence/
    ├── verify_count/
    ├── specific_kmer_presence/
    ├── specific_kmer_count/
    ├── verify_merge_presence/
    └── verify_merge_count/
 ```
@@ -0,0 +1,53 @@
 #!/usr/bin/env bash
 # Usage: aggregate_stats.sh TYPE
 # TYPE = indexing_presence | indexing_count
 #      | verify_presence | verify_count
 #      | specific_kmer_presence | specific_kmer_count
 #
 # Reads all stats/TYPE/*.stats files (one CSV data row each, no header) and
 # writes them, sorted by file name and prefixed with the run number, below a
 # TYPE-specific header into a new stats/TYPE/run_NNN.csv.
 # The new CSV is created only if any .stats file is newer than the most
 # recent run CSV (idempotent when nothing changed).
 # Exits with status 1 on a missing/unknown TYPE or a missing stats directory.
 set -euo pipefail
 # Report a missing argument explicitly instead of dying on an unbound "$1".
 if [[ $# -lt 1 ]]; then
     echo "Usage: $(basename "$0") TYPE" >&2
     exit 1
 fi
 TYPE="$1"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
 # CSV header for each family of stats; "run" is the column added below.
 case "${TYPE}" in
     indexing_presence|indexing_count)
         HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
         ;;
     verify_presence)
         HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
         ;;
     verify_count)
         HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
         ;;
     specific_kmer_presence|specific_kmer_count)
         HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
         ;;
     *)
         echo "ERROR: unknown stats type '${TYPE}'" >&2
         exit 1
         ;;
 esac
 # Without this check a missing directory makes the first find fail under
 # pipefail and the script stops with status 1 but no message at all.
 if [[ ! -d "${STATS_DIR}" ]]; then
     echo "ERROR: stats directory '${STATS_DIR}' does not exist" >&2
     exit 1
 fi
 # Find most recent existing run CSV (empty string if none).
 latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | sort | tail -1)
 # Nothing to do when no .stats file is newer than the latest run CSV.
 if [[ -n "${latest_csv}" ]]; then
     newer_stats=$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}")
     if [[ -z "${newer_stats}" ]]; then
         echo "[${TYPE}] stats up to date (${latest_csv})"
         exit 0
     fi
 fi
 # The next run number is the count of existing run CSVs, zero-padded to 3 digits.
 run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
 CSV="${STATS_DIR}/run_${run_n}.csv"
 echo "${HEADER}" >"${CSV}"
 # Sort .stats files by name for reproducible row order.
 while IFS= read -r stats_file; do
     sed "s/^/${run_n},/" "${stats_file}"
 done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"
 echo "[${TYPE}] run ${run_n} → ${CSV}"
@@ -0,0 +1,137 @@
 #!/usr/bin/env python3
 """Build a reference kmer index from paired-end FASTQ reads.
 Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
 counts their abundances, and saves a sorted numpy pair (kmers, counts).
 Output .npz arrays
  kmers  : uint64, sorted ascending — canonical kmer integers
  counts : uint32, same order      — raw read abundances
 """
 import argparse
 import gzip
 import sys
 from collections import defaultdict
 import numpy as np
 # ── encoding ────────────────────────────────────────────────────────────────
 # Base → 2-bit code (A=0, C=1, G=2, T=3); upper and lower case map alike.
 _ENCODE = dict(zip('ACGTacgt', (0, 1, 2, 3) * 2))
 # Reverse-complement lookup for one packed byte (4 bases, 2 bits each),
 # built once at import time: the base held in bit pair i moves to bit pair
 # 3 - i and is complemented as 3 - code.
 _REVCOMP8 = [
     sum((3 - ((_byte >> (2 * _pos)) & 3)) << (6 - 2 * _pos) for _pos in range(4))
     for _byte in range(256)
 ]
 def revcomp_int(kmer: int, k: int) -> int:
     """Return the reverse complement of a 2-bit-per-base packed kmer.
     The kmer is consumed from its low end one byte (up to 4 bases) at a
     time through the ``_REVCOMP8`` table.  When fewer than 8 bits remain,
     only the top bits of the table entry — those that correspond to real
     bases — are kept.
     Args:
         kmer: kmer packed as an integer, 2 bits per base.
         k: number of bases in the kmer.
     Returns:
         The reverse-complemented kmer in the same packing.
     """
     result = 0
     remaining = 2 * k
     while remaining > 0:
         width = 8 if remaining >= 8 else remaining
         result = (result << width) | (_REVCOMP8[kmer & 0xFF] >> (8 - width))
         kmer >>= width
         remaining -= width
     return result
 # ── FASTQ parsing ────────────────────────────────────────────────────────────
 def iter_sequences(path: str):
     """Yield the sequence line of every 4-line record in a FASTQ file.
     The file is opened with ``gzip.open`` when ``path`` ends in ``.gz`` and
     with plain ``open`` otherwise, in text mode.  Header, separator and
     quality lines are read and discarded.  Only the trailing newline is
     stripped from the sequence line.
     """
     open_text = gzip.open if path.endswith('.gz') else open
     with open_text(path, 'rt') as handle:
         header = handle.readline()      # '@' header; '' means end of file
         while header:
             sequence = handle.readline().rstrip('\n')
             handle.readline()           # '+' separator
             handle.readline()           # quality string
             yield sequence
             header = handle.readline()
 # ── kmer counting ────────────────────────────────────────────────────────────
 def count_kmers(paths: list[str], k: int) -> dict[int, int]:
     """Count canonical kmers over all reads of the given FASTQ files.
     Each read is scanned with a rolling 2-bit-per-base window.  A character
     missing from ``_ENCODE`` (e.g. N) resets the window, so no kmer spans it.
     For every full window the smaller of the kmer and its reverse complement
     is counted.  Progress is written to stderr.
     Args:
         paths: FASTQ files (optionally gzipped) to read.
         k: kmer length in bases.
     Returns:
         Mapping from canonical kmer integer to its number of occurrences.
     """
     counts: dict[int, int] = defaultdict(int)
     window_mask = (1 << (2 * k)) - 1
     n_reads = 0
     for fastq in paths:
         for read in iter_sequences(fastq):
             n_reads += 1
             window = 0
             valid = 0            # valid bases seen since the last reset
             for base in read:
                 code = _ENCODE.get(base)
                 if code is not None:
                     window = ((window << 2) | code) & window_mask
                     valid += 1
                     if valid >= k:
                         counts[min(window, revcomp_int(window, k))] += 1
                 else:
                     # N or unexpected character → start a fresh window
                     window = 0
                     valid = 0
             if n_reads % 100_000 == 0:
                 print(f'  {n_reads:,} reads processed, '
                       f'{len(counts):,} distinct kmers so far',
                       file=sys.stderr)
     print(f'  {n_reads:,} reads total, {len(counts):,} distinct kmers',
           file=sys.stderr)
     return counts
 # ── main ─────────────────────────────────────────────────────────────────────
 def main() -> None:
     """Command-line entry point: count canonical kmers and save them as .npz.
     Parses the arguments, counts kmers over all input FASTQ files, optionally
     drops kmers whose count is below ``--min-abundance``, and writes the
     ascending ``kmers`` (uint64) and matching ``counts`` (uint32) arrays with
     ``np.savez_compressed``.
     Exits with a usage error (status 2) when ``--kmer-size`` is outside
     1..32: kmers are packed at 2 bits per base into a uint64, so larger
     values could only fail after all reads had been counted.
     """
     ap = argparse.ArgumentParser(description=__doc__,
                                  formatter_class=argparse.RawDescriptionHelpFormatter)
     ap.add_argument('reads', nargs='+', metavar='FASTQ',
                     help='Input reads (FASTQ, gzip OK)')
     ap.add_argument('-k', '--kmer-size', type=int, default=31,
                     metavar='K')
     ap.add_argument('--min-abundance', type=int, default=1,
                     metavar='N', help='Drop kmers with count < N (default 1)')
     ap.add_argument('-o', '--output', required=True,
                     metavar='FILE', help='Output .npz path')
     args = ap.parse_args()
     # Fail fast: a kmer must fit in the uint64 used for storage.
     if not 1 <= args.kmer_size <= 32:
         ap.error(f'--kmer-size must be between 1 and 32 (got {args.kmer_size})')
     print(f'k={args.kmer_size}  files={len(args.reads)}', file=sys.stderr)
     counts = count_kmers(args.reads, args.kmer_size)
     if args.min_abundance > 1:
         before = len(counts)
         counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
         print(f'  min-abundance={args.min_abundance}: '
               f'{before - len(counts):,} kmers dropped, '
               f'{len(counts):,} retained',
               file=sys.stderr)
     print(f'Sorting and saving → {args.output}', file=sys.stderr)
     # Sort (kmer, count) pairs once so both arrays share the same order
     # without a second dictionary lookup per kmer.
     items = sorted(counts.items())
     kmers_arr  = np.fromiter((kmer for kmer, _ in items),
                              dtype=np.uint64, count=len(items))
     counts_arr = np.array([n for _, n in items], dtype=np.uint32)
     np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
     print(f'Done  {len(kmers_arr):,} kmers  →  {args.output}', file=sys.stderr)
 if __name__ == '__main__':
    main()
@@ -0,0 +1,39 @@
 #!/usr/bin/env bash
 # Usage: build_reference.sh [SPECIMEN ...]
 #
 # Builds reference kmer sets reference_index/<species>--<strain>.npz from the
 # paired reads in simulated_data/<species>/<strain>/ via build_reference.py.
 #
 # With SPECIMEN arguments (each <species>--<strain>, as passed by the Makefile
 # rule for reference_index/%.npz) only those specimens are built, and missing
 # reads are a hard error so that make sees the failure.
 # Without arguments every specimen found under simulated_data/ is built, and
 # specimens whose reads are missing are skipped with a message on stderr.
 #
 # Environment:
 #   KMER_SIZE      value passed as --kmer-size      (default 31)
 #   MIN_ABUNDANCE  value passed as --min-abundance  (default 1)
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
 REF_DIR="${SCRIPT_DIR}/reference_index"
 PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
 BUILD_PY="${SCRIPT_DIR}/build_reference.py"
 KMER_SIZE="${KMER_SIZE:-31}"
 MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
 mkdir -p "${REF_DIR}"
 # Succeeds when both read files exist in directory $1.
 has_reads() {
     [[ -f "$1/reads_R1.fastq.gz" && -f "$1/reads_R2.fastq.gz" ]]
 }
 # Build the reference set of one specimen.
 #   $1 = specimen name (<species>--<strain>)
 #   $2 = directory holding reads_R1.fastq.gz and reads_R2.fastq.gz
 build_one() {
     local specimen="$1"
     local reads_dir="$2"
     local out="${REF_DIR}/${specimen}.npz"
     echo "[${specimen}] → ${out}"
     "${PYTHON}" "${BUILD_PY}" \
         --kmer-size      "${KMER_SIZE}" \
         --min-abundance  "${MIN_ABUNDANCE}" \
         --output         "${out}" \
         "${reads_dir}/reads_R1.fastq.gz" "${reads_dir}/reads_R2.fastq.gz"
 }
 # Targeted mode: build only the specimens named on the command line.
 if [[ $# -gt 0 ]]; then
     for specimen in "$@"; do
         # Same mapping as the Makefile: "--" becomes a path separator.
         reads_dir="${SIMDATA_DIR}/${specimen//--//}"
         if ! has_reads "${reads_dir}"; then
             echo "ERROR ${specimen}: reads not found in ${reads_dir}" >&2
             exit 1
         fi
         build_one "${specimen}" "${reads_dir}"
     done
     exit 0
 fi
 # Default mode: walk simulated_data/<species>/<strain>/ and build everything.
 for species_dir in "${SIMDATA_DIR}"/*/; do
     [[ -d "${species_dir}" ]] || continue
     species=$(basename "${species_dir}")
     for strain_dir in "${species_dir}"*/; do
         [[ -d "${strain_dir}" ]] || continue
         strain=$(basename "${strain_dir}")
         if ! has_reads "${strain_dir%/}"; then
             echo "SKIP ${species}--${strain}: reads not found" >&2
             continue
         fi
         build_one "${species}--${strain}" "${strain_dir%/}"
     done
 done
@@ -0,0 +1,199 @@
 # Specimens covered by the per-specimen rules below, one word each, written
 # as <species>--<strain>.
 SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
 # Distinct species prefixes of SPECIMENS; each one has a pair of
 # specific_index_presence / specific_index_count rules further down.
 SPECIES   := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
 # Escherichia_coli--K-12_MG1655
 simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
 reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
 specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
 specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
 stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
 stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
 # Escherichia_coli--EDL933
 simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
 reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
 specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
 specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
 stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
 stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
 # Salmonella_enterica--LT2
 simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
 reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
 specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
 specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
 stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
 stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
 # Escherichia_coli--CFT073
 simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
 reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
 specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
 specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
 stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
 stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
 # Bacillus_subtilis--168
 simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
 reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
 specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
 specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
 stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
 stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
 # Salmonella_enterica--P125109
 simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
 reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
 specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
 specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
 stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
 stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
 # Shouchella_clausii--KSM-K16
 simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
 reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
 specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
 specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
 stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
 stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
 # Escherichia_coli--K-12_W3110
 simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
 reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
 specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
 specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
 stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
 stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
 # Klebsiella_pneumoniae--MGH_78578
 simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
 reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
 specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
 specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
 stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
 stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
 # Opitutus_terrae--PB90-1
 simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
 reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
 specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
 specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
 stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
 stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
 # Saccharolobus_islandicus--M.16.4
 simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
 reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
 specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
 specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
 stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
 stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
 # Acidobacterium_capsulatum--ATCC_51196
 simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
 reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
 specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
 specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
 stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
 stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
 # Salmonella_enterica--AKU_12601
 simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
 reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
 specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
 specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
 stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
 stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
 # Proteus_mirabilis--HI4320
 simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
 reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
 specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
 specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
 stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
 stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
 # Salmonella_enterica--CT18
 simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
 reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
 specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
 specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
 stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
 stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
 # Klebsiella_pneumoniae--HS11286
 simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
 reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
 specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
 specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
 stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
 stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
 # Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
 simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
 reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
 specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
 specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
 stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
 stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
 # Klebsiella_pneumoniae--ATCC_13883
 simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
 reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
 specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
 specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
 stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
 stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
 # Yersinia_ruckeri--YRB
 simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
 reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
 specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
 specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
 stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
 stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
 # Candidozyma_auris--GCF_003013715.1_ASM301371v2
 simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
 reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
 specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
 specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
 stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
 stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
 # Escherichia_coli
 specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
 specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
 # Salmonella_enterica
 specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
 specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
 # Bacillus_subtilis
 specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
 specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
 # Shouchella_clausii
 specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
 specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
 # Klebsiella_pneumoniae
 specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
 specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
 # Opitutus_terrae
 specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
 specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
 # Saccharolobus_islandicus
 specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
 specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
 # Acidobacterium_capsulatum
 specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
 specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
 # Proteus_mirabilis
 specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
 specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
 # Wolbachia_endosymbiont
 specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
 specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
 # Yersinia_ruckeri
 specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
 specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
 # Candidozyma_auris
 specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
 specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
@@ -0,0 +1,48 @@
 #!/usr/bin/env bash
 # Download the benchmark genome assemblies listed below and store every
 # genomic FASTA found in each archive as genomes/<name>.fna.gz.
 # Runs relative to the current directory and needs the `datasets`, `unzip`
 # and `obiconvert` commands on PATH.
 set -euo pipefail
 assemblies=(
    GCF_000005845.2
    GCF_000010245.2
    GCF_000007445.1
    GCF_000006665.1
    GCF_000006945.2
    GCF_000195995.1
    GCF_000009505.1
    GCF_000026565.1
    GCF_000016305.1
    GCF_000019965.1
    GCF_000240185.1
    GCF_000742135.1
    GCF_000069965.1
    GCF_000022565.1
    GCF_000306885.1
    GCF_003013715.1
    GCF_000009045.1
    GCF_000009825.1
    GCF_000022445.1
    GCF_000834255.1
 )
 mkdir -p genomes
 for acc in "${assemblies[@]}"; do
    echo "Downloading ${acc}"
    datasets download genome accession "${acc}" \
        --include genome \
        --filename "${acc}.zip"
    unzip -q "${acc}.zip" -d "${acc}"
    # NUL-delimited names plus quoting keep paths containing spaces,
    # backslashes or glob characters intact.
    find "${acc}" -name "*.fna" -print0 |
        while IFS= read -r -d '' file; do
            obiconvert -Z "${file}" >"genomes/$(basename "${file}").gz"
        done
    # Drop the archive and its extraction directory once converted.
    rm -rf "${acc}" "${acc}.zip"
 done
@@ -0,0 +1,108 @@
 #!/usr/bin/env bash
 # Usage: filter_one_count.sh SPECIES
 # Filters global_index_count to keep only kmers specific to SPECIES,
 # then selects the SPECIES column in-place.
 # Outputs:
 #   specific_index_count/SPECIES/index.done  (written by obikmer select)
 #   stats/specific_kmer_count/SPECIES.stats  (one CSV data row, no header)
 set -euo pipefail
 # Fail with a usage message instead of a bare "unbound variable" error.
 SPECIES="${1:?Usage: filter_one_count.sh SPECIES}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 SOURCE="${SCRIPT_DIR}/global_index_count"
 OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
 STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
 STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
 mkdir -p "${STATS_DIR}"
 echo "[${SPECIES}] filter (count) → ${OUTPUT}"
 LOG_FILTER=$(mktemp)
 LOG_SELECT=$(mktemp)
 trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
 # run_logged LOG CMD [ARGS...]
 # Runs CMD with its stderr captured in LOG, replays LOG on our own stderr
 # whether or not CMD succeeded, then returns CMD's exit status.
 # A plain `CMD 2>LOG; cat LOG >&2` loses the error text under `set -e`:
 # the script aborts before the cat and the EXIT trap deletes the log.
 run_logged() {
    local log="$1" status=0
    shift
    "$@" 2>"${log}" || status=$?
    cat "${log}" >&2
    return "${status}"
 }
 run_logged "${LOG_FILTER}" "${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}"
 run_logged "${LOG_SELECT}" "${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}"
 # Turn the per-stage timing tables found in both logs into one CSV row.
 python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
 import sys, re
 species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
 # Remove ANSI CSI escape sequences (colours, cursor moves) from a line.
 def strip_ansi(s):
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
 # Convert a duration written as "<n>ms" or "<n>s" to seconds; anything else
 # yields 0.0.
 def parse_wall(s):
    s = s.strip()
    if s.endswith('ms'): return float(s[:-2]) / 1000.0
    if s.endswith('s'):  return float(s[:-1])
    return 0.0
 # Convert a size written as "<n> GB|MB|KB|B" to a byte count using binary
 # multipliers; unparseable input yields 0.
 def parse_rss(s):
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
 # A separator is a non-empty line without any ASCII letter or digit.
 def is_sep(s):
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
 # Extract {stage: (wall_seconds, rss_bytes)} from the timing table in logfile.
 # Expected layout: a header line containing "stage" ... "wall", a separator,
 # one row per stage, a second separator, then a single total row stored
 # under the key 'TOTAL'. Columns are split on runs of two or more spaces:
 # column 0 is the stage name, column 1 the wall time, column 3 the RSS
 # (column 2 is not used here).
 def parse_reporter(logfile):
    stats = {}
    state = 'scan'
    with open(logfile, errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s    = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s): state = 'rows'
            elif state == 'rows':
                if is_sep(s): state = 'total'
                elif s:
                    parts = re.split(r'  +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                # Only the first line after the closing separator is examined,
                # even when it is blank.
                if s:
                    parts = re.split(r'  +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                break
    return stats
 f = parse_reporter(log_filter)
 s = parse_reporter(log_select)
 # Row layout: species, then a (wall, rss) pair for each of rebuild, pack,
 # filter total, select and select total. A stage missing from its log
 # contributes two empty fields.
 row = [species]
 for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
    key = 'TOTAL' if stage.endswith('_total') else stage
    w, r = d.get(key, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
 print(','.join(row))
 PYEOF
@@ -0,0 +1,108 @@
 #!/usr/bin/env bash
 # Usage: filter_one_presence.sh SPECIES
 # Filters global_index_presence to keep only kmers specific to SPECIES,
 # then selects the SPECIES column in-place.
 # Outputs:
 #   specific_index_presence/SPECIES/index.done  (written by obikmer select)
 #   stats/specific_kmer_presence/SPECIES.stats  (one CSV data row, no header)
 set -euo pipefail
 # Fail with a usage message instead of a bare "unbound variable" error.
 SPECIES="${1:?Usage: filter_one_presence.sh SPECIES}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 SOURCE="${SCRIPT_DIR}/global_index_presence"
 OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
 STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
 STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
 mkdir -p "${STATS_DIR}"
 echo "[${SPECIES}] filter (presence) → ${OUTPUT}"
 LOG_FILTER=$(mktemp)
 LOG_SELECT=$(mktemp)
 trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
 # run_logged LOG CMD [ARGS...]
 # Runs CMD with its stderr captured in LOG, replays LOG on our own stderr
 # whether or not CMD succeeded, then returns CMD's exit status.
 # A plain `CMD 2>LOG; cat LOG >&2` loses the error text under `set -e`:
 # the script aborts before the cat and the EXIT trap deletes the log.
 run_logged() {
    local log="$1" status=0
    shift
    "$@" 2>"${log}" || status=$?
    cat "${log}" >&2
    return "${status}"
 }
 run_logged "${LOG_FILTER}" "${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}"
 run_logged "${LOG_SELECT}" "${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}"
 # Turn the per-stage timing tables found in both logs into one CSV row.
 python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
 import sys, re
 species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
 # Remove ANSI CSI escape sequences (colours, cursor moves) from a line.
 def strip_ansi(s):
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
 # Convert a duration written as "<n>ms" or "<n>s" to seconds; anything else
 # yields 0.0.
 def parse_wall(s):
    s = s.strip()
    if s.endswith('ms'): return float(s[:-2]) / 1000.0
    if s.endswith('s'):  return float(s[:-1])
    return 0.0
 # Convert a size written as "<n> GB|MB|KB|B" to a byte count using binary
 # multipliers; unparseable input yields 0.
 def parse_rss(s):
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
 # A separator is a non-empty line without any ASCII letter or digit.
 def is_sep(s):
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
 # Extract {stage: (wall_seconds, rss_bytes)} from the timing table in logfile.
 # Expected layout: a header line containing "stage" ... "wall", a separator,
 # one row per stage, a second separator, then a single total row stored
 # under the key 'TOTAL'. Columns are split on runs of two or more spaces:
 # column 0 is the stage name, column 1 the wall time, column 3 the RSS
 # (column 2 is not used here).
 def parse_reporter(logfile):
    stats = {}
    state = 'scan'
    with open(logfile, errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s    = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s): state = 'rows'
            elif state == 'rows':
                if is_sep(s): state = 'total'
                elif s:
                    parts = re.split(r'  +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                # Only the first line after the closing separator is examined,
                # even when it is blank.
                if s:
                    parts = re.split(r'  +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                break
    return stats
 f = parse_reporter(log_filter)
 s = parse_reporter(log_select)
 # Row layout: species, then a (wall, rss) pair for each of rebuild, pack,
 # filter total, select and select total. A stage missing from its log
 # contributes two empty fields.
 row = [species]
 for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
    key = 'TOTAL' if stage.endswith('_total') else stage
    w, r = d.get(key, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
 print(','.join(row))
 PYEOF
@@ -0,0 +1,103 @@
 #!/usr/bin/env bash
 # Usage: index_one_count.sh SPECIMEN
 # SPECIMEN = "species--strain" (Make pattern stem)
 # Outputs:
 #   specimen_index_count/SPECIMEN/index.done  (written by obikmer)
 #   stats/indexing_count/SPECIMEN.stats       (one CSV data row, no header)
 set -euo pipefail
 # Print the usage line instead of a bare "unbound variable" when the argument is missing.
 SPECIMEN="${1:?Usage: index_one_count.sh SPECIMEN}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 # Split at the first "--": species is what precedes it, strain is everything after.
 species="${SPECIMEN%%--*}"
 strain="${SPECIMEN#*--}"
 READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
 INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
 STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
 STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
 mkdir -p "${STATS_DIR}"
 r1="${READS_DIR}/reads_R1.fastq.gz"
 r2="${READS_DIR}/reads_R2.fastq.gz"
 if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
 fi
 echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
 STDERR_LOG=$(mktemp)
 trap 'rm -f "${STDERR_LOG}"' EXIT
 # stderr is captured for the stats parser below. `|| status=$?` keeps `set -e`
 # from aborting before the captured log is replayed; without it a failing
 # obikmer run would exit silently and the EXIT trap would delete its messages.
 status=0
 "${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --with-counts \
    --label "${SPECIMEN}" \
    --meta  "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || status=$?
 cat "${STDERR_LOG}" >&2
 if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer index failed for ${SPECIMEN} (exit ${status})" >&2
    exit "${status}"
 fi
 python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
 import sys, re
 species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
 def strip_ansi(s):
    """Return s with ANSI CSI escape sequences (ESC [ ... final byte) removed."""
    ansi_csi = r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]'
    return re.sub(ansi_csi, '', s)
 def parse_wall(s):
    """Convert a wall-time cell such as '1.5s', '20ms', '250us' or '40ns' to seconds.
    Returns 0.0 for an unknown unit or a non-numeric value.
    """
    s = s.strip()
    # Sub-second suffixes must be tried before the bare 's': they all end in
    # 's', and float('250u') would raise ValueError.
    units = (('ms', 1e3), ('\u00b5s', 1e6), ('\u03bcs', 1e6), ('us', 1e6),
             ('ns', 1e9), ('s', 1.0))
    for suffix, per_second in units:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0
 def parse_rss(s):
    """Convert a size such as '1.5 GB' or '512MB' into a whole number of bytes.
    Returns 0 when the text does not start with a number followed by GB, MB, KB or B.
    """
    unit_bytes = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * unit_bytes[unit])
 def is_sep(s):
    """True for a non-empty string containing no ASCII letter or digit (a table rule line)."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
 # Walk the reporter table in the captured log: header line, separator,
 # one row per stage, separator, then a single summary row.
 stats = {}
 phase = 'scan'
 with open(logfile, errors='replace') as log:
    for raw_line in log:
        clean = strip_ansi(raw_line.rstrip('\n'))
        text = clean.strip()
        if phase == 'total':
            # Only the first line after the closing separator is examined.
            fields = re.split(r'  +', text) if text else []
            if len(fields) >= 3:
                wall = parse_wall(fields[1])
                rss = parse_rss(fields[3]) if len(fields) > 3 else 0
                stats[fields[0]] = (wall, rss)
            break
        if phase == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', clean):
                phase = 'in_header'
            continue
        if is_sep(text):
            # First separator closes the header, second closes the stage rows.
            phase = 'rows' if phase == 'in_header' else 'total'
            continue
        if phase == 'rows' and text:
            fields = re.split(r'  +', text)
            if len(fields) >= 4:
                stats[fields[0]] = (parse_wall(fields[1]), parse_rss(fields[3]))
 STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
 # CSV row: identifiers, then wall seconds and RSS bytes per stage, TOTAL last.
 # A stage absent from the log yields two empty cells.
 cells = [species, strain]
 for stage in STAGE_ORDER + ['TOTAL']:
    wall, rss = stats.get(stage, ('', ''))
    cells.append(f'{wall:.3f}' if isinstance(wall, float) else '')
    cells.append(str(rss))
 print(','.join(cells))
 PYEOF
@@ -0,0 +1,102 @@
 #!/usr/bin/env bash
 # Usage: index_one_presence.sh SPECIMEN
 # SPECIMEN = "species--strain" (Make pattern stem)
 # Outputs:
 #   specimen_index_presence/SPECIMEN/index.done  (written by obikmer)
 #   stats/indexing_presence/SPECIMEN.stats       (one CSV data row, no header)
 set -euo pipefail
 # Print the usage line instead of a bare "unbound variable" when the argument is missing.
 SPECIMEN="${1:?Usage: index_one_presence.sh SPECIMEN}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 # Split at the first "--": species is what precedes it, strain is everything after.
 species="${SPECIMEN%%--*}"
 strain="${SPECIMEN#*--}"
 READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
 INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
 STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
 STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
 mkdir -p "${STATS_DIR}"
 r1="${READS_DIR}/reads_R1.fastq.gz"
 r2="${READS_DIR}/reads_R2.fastq.gz"
 if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
 fi
 echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"
 STDERR_LOG=$(mktemp)
 trap 'rm -f "${STDERR_LOG}"' EXIT
 # stderr is captured for the stats parser below. `|| status=$?` keeps `set -e`
 # from aborting before the captured log is replayed; without it a failing
 # obikmer run would exit silently and the EXIT trap would delete its messages.
 status=0
 "${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --label "${SPECIMEN}" \
    --meta  "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || status=$?
 cat "${STDERR_LOG}" >&2
 if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer index failed for ${SPECIMEN} (exit ${status})" >&2
    exit "${status}"
 fi
 python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
 import sys, re
 species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
 def strip_ansi(s):
    """Return s with ANSI CSI escape sequences (ESC [ ... final byte) removed."""
    ansi_csi = r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]'
    return re.sub(ansi_csi, '', s)
 def parse_wall(s):
    """Convert a wall-time cell such as '1.5s', '20ms', '250us' or '40ns' to seconds.
    Returns 0.0 for an unknown unit or a non-numeric value.
    """
    s = s.strip()
    # Sub-second suffixes must be tried before the bare 's': they all end in
    # 's', and float('250u') would raise ValueError.
    units = (('ms', 1e3), ('\u00b5s', 1e6), ('\u03bcs', 1e6), ('us', 1e6),
             ('ns', 1e9), ('s', 1.0))
    for suffix, per_second in units:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0
 def parse_rss(s):
    """Convert a size such as '1.5 GB' or '512MB' into a whole number of bytes.
    Returns 0 when the text does not start with a number followed by GB, MB, KB or B.
    """
    unit_bytes = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * unit_bytes[unit])
 def is_sep(s):
    """True for a non-empty string containing no ASCII letter or digit (a table rule line)."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
 # Walk the reporter table in the captured log: header line, separator,
 # one row per stage, separator, then a single summary row.
 stats = {}
 phase = 'scan'
 with open(logfile, errors='replace') as log:
    for raw_line in log:
        clean = strip_ansi(raw_line.rstrip('\n'))
        text = clean.strip()
        if phase == 'total':
            # Only the first line after the closing separator is examined.
            fields = re.split(r'  +', text) if text else []
            if len(fields) >= 3:
                wall = parse_wall(fields[1])
                rss = parse_rss(fields[3]) if len(fields) > 3 else 0
                stats[fields[0]] = (wall, rss)
            break
        if phase == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', clean):
                phase = 'in_header'
            continue
        if is_sep(text):
            # First separator closes the header, second closes the stage rows.
            phase = 'rows' if phase == 'in_header' else 'total'
            continue
        if phase == 'rows' and text:
            fields = re.split(r'  +', text)
            if len(fields) >= 4:
                stats[fields[0]] = (parse_wall(fields[1]), parse_rss(fields[3]))
 STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
 # CSV row: identifiers, then wall seconds and RSS bytes per stage, TOTAL last.
 # A stage absent from the log yields two empty cells.
 cells = [species, strain]
 for stage in STAGE_ORDER + ['TOTAL']:
    wall, rss = stats.get(stage, ('', ''))
    cells.append(f'{wall:.3f}' if isinstance(wall, float) else '')
    cells.append(str(rss))
 print(','.join(cells))
 PYEOF
@@ -0,0 +1,118 @@
 #!/usr/bin/env python3
 """Generate deps.mk — pure dependency declarations for the benchmark pipeline.
 Like C .d files: only target: prerequisites lines, no recipes.
 Recipes stay in the Makefile as generic rules.
 """
 import gzip
 import re
 import sys
 from pathlib import Path
 # Tokens that end the strain part of a FASTA definition line.
 STOP_WORDS    = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
                 'endosymbiont', 'of'}
 STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
 def is_stop(tok):
    """True when tok, lower-cased, is a stop word or begins with a stop prefix."""
    lowered = tok.lower()
    if lowered in STOP_WORDS:
        return True
    # str.startswith accepts a tuple and is True if any prefix matches.
    return lowered.startswith(STOP_PREFIXES)
 def sanitize(s):
    """Replace every character outside [A-Za-z0-9._-] with '_' and trim '_' from both ends."""
    replaced = re.sub(r'[^A-Za-z0-9._-]', '_', s)
    return replaced.strip('_')
 def collect_tokens(text):
    """Join the sanitized leading words of text with '_', stopping at the first stop token.
    Trailing ',' and '.' are stripped from each word before the stop test;
    words that sanitize to an empty string are left out.
    """
    kept = []
    for word in text.split():
        word = word.rstrip(',.')
        if is_stop(word):
            break
        cleaned = sanitize(word)
        if cleaned:
            kept.append(cleaned)
    return '_'.join(kept)
 def parse_organism(defn, gcf_id):
    """Derive (species, strain) identifiers from a FASTA definition string.
    species is the sanitized first two words joined by '_'. strain is taken,
    in order of preference, from:
      1. "str. X" optionally followed by "substr. Y"  -> X or X_Y
      2. the words after "strain", up to the first stop token
      3. the words after the two species words (a leading "subsp. X" and
         "serovar X" are dropped), up to the first stop token
    and falls back to gcf_id when none of these yields anything.
    A definition with fewer than two words (for instance the bare accession
    or file stem that first_definition() can return) gives gcf_id as strain,
    and gcf_id as species too when the definition is empty.
    """
    words   = defn.split()
    if len(words) < 2:
        # words[1] would raise IndexError; there is no binomial name to use.
        return (sanitize(words[0]) if words else gcf_id), gcf_id
    species = sanitize(words[0] + '_' + words[1])
    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
    if m:
        strain = sanitize(m.group(1))
        if m.group(2):
            strain += '_' + sanitize(m.group(2))
        return species, strain
    m = re.search(r'\bstrain\b\s+(.*)', defn)
    if m:
        strain = collect_tokens(m.group(1))
        if strain:
            return species, strain
    remainder = re.sub(r'^\S+ \S+\s*', '', defn)
    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
    strain    = collect_tokens(remainder)
    return species, strain if strain else gcf_id
 def first_definition(path):
    """Return the organism definition found on the first FASTA header of a gzipped file.
    The value of a "definition":"..." annotation on that header is returned if
    present, otherwise the first whitespace-delimited word after '>'. A file
    with no header line yields Path(path).stem.
    """
    with gzip.open(path, 'rt') as fh:
        header = next((ln for ln in fh if ln.startswith('>')), None)
    if header is None:
        return Path(path).stem
    found = re.search(r'"definition":"([^"]*)"', header)
    if found:
        return found.group(1)
    return header[1:].split()[0]
 def _specimen_of(path):
    """Return (species, strain) for one genome file, read from its first FASTA header."""
    gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
    return parse_organism(first_definition(path), gcf_id)
 def main():
    """Print Make dependency declarations for the genome files given as arguments.
    Two invocation forms:
      make_deps.py GENOME...          print SPECIMENS/SPECIES lists followed by
                                      prerequisite-only rules (no recipes)
      make_deps.py --dir-for GENOME   print only the simulated-reads directory
                                      (simulated_data/<species>/<strain>) of one
                                      genome, as used by simulate_all.sh
    """
    args = sys.argv[1:]
    if args and args[0] == '--dir-for':
        # Without this branch '--dir-for' was treated as a genome path and
        # gzip.open() failed on it.
        if len(args) != 2:
            sys.exit('usage: make_deps.py --dir-for GENOME')
        sp, st = _specimen_of(args[1])
        print(f'simulated_data/{sp}/{st}')
        return
    entries = []   # (specimen, species, sim_dir, genome_path)
    species_seen = []
    for path in sorted(args):
        sp, st  = _specimen_of(path)
        specimen = f'{sp}--{st}'
        sim_dir  = f'simulated_data/{sp}/{st}'
        entries.append((specimen, sp, sim_dir, path))
        # A list rather than a set: keeps first-seen order in the output.
        if sp not in species_seen:
            species_seen.append(sp)
    specimens = [e[0] for e in entries]
    print('SPECIMENS :=', ' '.join(specimens))
    print('SPECIES   :=', ' '.join(species_seen))
    for specimen, species, sim_dir, genome in entries:
        reads = f'{sim_dir}/reads_R1.fastq.gz'
        p_done  = f'specimen_index_presence/{specimen}/index.done'
        p_stats = f'stats/indexing_presence/{specimen}.stats'
        c_done  = f'specimen_index_count/{specimen}/index.done'
        c_stats = f'stats/indexing_count/{specimen}.stats'
        ref     = f'reference_index/{specimen}.npz'
        vp      = f'stats/verify_presence/{specimen}.stats'
        vc      = f'stats/verify_count/{specimen}.stats'
        print()
        print(f'# {specimen}')
        print(f'{reads}: {genome}')
        print(f'{ref}: {reads}')
        print(f'{p_done} {p_stats}: {reads}')
        print(f'{c_done} {c_stats}: {reads}')
        print(f'{vp}: {ref} {p_done}')
        print(f'{vc}: {ref} {c_done}')
    print()
    for sp in species_seen:
        sp_done  = f'specific_index_presence/{sp}/index.done'
        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
        sc_done  = f'specific_index_count/{sp}/index.done'
        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
        print(f'# {sp}')
        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
        print(f'{sc_done} {sc_stats}: global_index_count/index.done')
 if __name__ == '__main__':
    main()
@@ -0,0 +1,103 @@
 #!/usr/bin/env bash
 # Merge every per-specimen count index under specimen_index_count/ into
 # global_index_count/ and record the stage timings reported on obikmer's stderr
 # as a two-line CSV (header + one data row) under stats/merge_count/.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
 OUTPUT="${SCRIPT_DIR}/global_index_count"
 STATS_DIR="${SCRIPT_DIR}/stats/merge_count"
 mkdir -p "${STATS_DIR}"
 # Run number = how many run_*.csv files already exist, zero-padded to 3 digits.
 run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
 CSV="${STATS_DIR}/run_${run_n}.csv"
 # The header is written up front; the data row is appended after the merge.
 printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
 parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
 import sys, re
 run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
 def strip_ansi(s):
    """Return s with ANSI CSI escape sequences (ESC [ ... final byte) removed."""
    ansi_csi = r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]'
    return re.sub(ansi_csi, '', s)
 def parse_wall(s):
    """Convert a wall-time cell such as '1.5s', '20ms', '250us' or '40ns' to seconds.
    Returns 0.0 for an unknown unit or a non-numeric value.
    """
    s = s.strip()
    # Sub-second suffixes must be tried before the bare 's': they all end in
    # 's', and float('250u') would raise ValueError.
    units = (('ms', 1e3), ('\u00b5s', 1e6), ('\u03bcs', 1e6), ('us', 1e6),
             ('ns', 1e9), ('s', 1.0))
    for suffix, per_second in units:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0
 def parse_rss(s):
    """Convert a size such as '1.5 GB' or '512MB' into a whole number of bytes.
    Returns 0 when the text does not start with a number followed by GB, MB, KB or B.
    """
    unit_bytes = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * unit_bytes[unit])
 def is_sep(s):
    """True for a non-empty string containing no ASCII letter or digit (a table rule line)."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
 # Walk the reporter table in the captured log: header line, separator,
 # one row per stage, separator, then a single summary row.
 stats = {}
 phase = 'scan'
 with open(logfile, errors='replace') as log:
    for raw_line in log:
        clean = strip_ansi(raw_line.rstrip('\n'))
        text = clean.strip()
        if phase == 'total':
            # Only the first line after the closing separator is examined.
            fields = re.split(r'  +', text) if text else []
            if len(fields) >= 3:
                wall = parse_wall(fields[1])
                rss = parse_rss(fields[3]) if len(fields) > 3 else 0
                stats[fields[0]] = (wall, rss)
            break
        if phase == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', clean):
                phase = 'in_header'
            continue
        if is_sep(text):
            # First separator closes the header, second closes the stage rows.
            phase = 'rows' if phase == 'in_header' else 'total'
            continue
        if phase == 'rows' and text:
            fields = re.split(r'  +', text)
            if len(fields) >= 4:
                stats[fields[0]] = (parse_wall(fields[1]), parse_rss(fields[3]))
 STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
 # CSV row: run id and source count, then wall seconds and RSS bytes per stage,
 # TOTAL last. A stage absent from the log yields two empty cells.
 cells = [run, n_sources]
 for stage in STAGE_ORDER + ['TOTAL']:
    wall, rss = stats.get(stage, ('', ''))
    cells.append(f'{wall:.3f}' if isinstance(wall, float) else '')
    cells.append(str(rss))
 print(','.join(cells))
 PYEOF
 }
 # Every direct sub-directory of IDX_DIR is taken as one source index, in sorted order.
 mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
 if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    # ${CSV} holds only the header at this point; remove it so a failed run
    # neither leaves an empty result file nor uses up a run number.
    rm -f "${CSV}"
    exit 1
 fi
 echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
 printf '  %s\n' "${sources[@]}"
 STDERR_LOG=$(mktemp)
 trap 'rm -f "${STDERR_LOG}"' EXIT
 # stderr is captured for parse_reporter. `|| status=$?` keeps `set -e` from
 # aborting before the captured log is replayed; without it a failing merge
 # would exit silently and the EXIT trap would delete its messages.
 status=0
 "${BINARY}" merge \
    --output  "${OUTPUT}" \
    --force \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || status=$?
 cat "${STDERR_LOG}" >&2
 if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer merge failed (exit ${status})" >&2
    rm -f "${CSV}"
    exit "${status}"
 fi
 parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
 echo "Done. Run ${run_n} → ${CSV}"
@@ -0,0 +1,104 @@
 #!/usr/bin/env bash
 # Merge every per-specimen presence index under specimen_index_presence/ into
 # global_index_presence/ and record the stage timings reported on obikmer's
 # stderr as a two-line CSV (header + one data row) under stats/merge_presence/.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
 OUTPUT="${SCRIPT_DIR}/global_index_presence"
 STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"
 mkdir -p "${STATS_DIR}"
 # Run number = how many run_*.csv files already exist, zero-padded to 3 digits.
 run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
 CSV="${STATS_DIR}/run_${run_n}.csv"
 # The header is written up front; the data row is appended after the merge.
 printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
 parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
 import sys, re
 run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
 def strip_ansi(s):
    """Return s with ANSI CSI escape sequences (ESC [ ... final byte) removed."""
    ansi_csi = r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]'
    return re.sub(ansi_csi, '', s)
 def parse_wall(s):
    """Convert a wall-time cell such as '1.5s', '20ms', '250us' or '40ns' to seconds.
    Returns 0.0 for an unknown unit or a non-numeric value.
    """
    s = s.strip()
    # Sub-second suffixes must be tried before the bare 's': they all end in
    # 's', and float('250u') would raise ValueError.
    units = (('ms', 1e3), ('\u00b5s', 1e6), ('\u03bcs', 1e6), ('us', 1e6),
             ('ns', 1e9), ('s', 1.0))
    for suffix, per_second in units:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / per_second
            except ValueError:
                return 0.0
    return 0.0
 def parse_rss(s):
    """Convert a size such as '1.5 GB' or '512MB' into a whole number of bytes.
    Returns 0 when the text does not start with a number followed by GB, MB, KB or B.
    """
    unit_bytes = {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}
    found = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if found is None:
        return 0
    amount, unit = found.groups()
    return int(float(amount) * unit_bytes[unit])
 def is_sep(s):
    """True for a non-empty string containing no ASCII letter or digit (a table rule line)."""
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
 # Walk the reporter table in the captured log: header line, separator,
 # one row per stage, separator, then a single summary row.
 stats = {}
 phase = 'scan'
 with open(logfile, errors='replace') as log:
    for raw_line in log:
        clean = strip_ansi(raw_line.rstrip('\n'))
        text = clean.strip()
        if phase == 'total':
            # Only the first line after the closing separator is examined.
            fields = re.split(r'  +', text) if text else []
            if len(fields) >= 3:
                wall = parse_wall(fields[1])
                rss = parse_rss(fields[3]) if len(fields) > 3 else 0
                stats[fields[0]] = (wall, rss)
            break
        if phase == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', clean):
                phase = 'in_header'
            continue
        if is_sep(text):
            # First separator closes the header, second closes the stage rows.
            phase = 'rows' if phase == 'in_header' else 'total'
            continue
        if phase == 'rows' and text:
            fields = re.split(r'  +', text)
            if len(fields) >= 4:
                stats[fields[0]] = (parse_wall(fields[1]), parse_rss(fields[3]))
 STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
 # CSV row: run id and source count, then wall seconds and RSS bytes per stage,
 # TOTAL last. A stage absent from the log yields two empty cells.
 cells = [run, n_sources]
 for stage in STAGE_ORDER + ['TOTAL']:
    wall, rss = stats.get(stage, ('', ''))
    cells.append(f'{wall:.3f}' if isinstance(wall, float) else '')
    cells.append(str(rss))
 print(','.join(cells))
 PYEOF
 }
 # Every direct sub-directory of IDX_DIR is taken as one source index, in sorted order.
 mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
 if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    # ${CSV} holds only the header at this point; remove it so a failed run
    # neither leaves an empty result file nor uses up a run number.
    rm -f "${CSV}"
    exit 1
 fi
 echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
 printf '  %s\n' "${sources[@]}"
 STDERR_LOG=$(mktemp)
 trap 'rm -f "${STDERR_LOG}"' EXIT
 # stderr is captured for parse_reporter. `|| status=$?` keeps `set -e` from
 # aborting before the captured log is replayed; without it a failing merge
 # would exit silently and the EXIT trap would delete its messages.
 status=0
 "${BINARY}" merge \
    --output          "${OUTPUT}" \
    --force \
    --force-presence \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || status=$?
 cat "${STDERR_LOG}" >&2
 if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer merge failed (exit ${status})" >&2
    rm -f "${CSV}"
    exit "${status}"
 fi
 parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
 echo "Done. Run ${run_n} → ${CSV}"
@@ -0,0 +1,12 @@
 #!/usr/bin/env bash
 # Simulate all genomes. Delegates to simulate_one.sh per genome.
 # Prefer running via `gmake simulate` which handles individual dependencies.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 # If genomes/ holds no *.fna.gz the unexpanded glob itself is passed on as
 # genome_file (nullglob is not enabled here).
 for genome_file in "${SCRIPT_DIR}"/genomes/*.fna.gz; do
    # The output directory is whatever make_deps.py prints for this genome.
    # NOTE(review): this requires make_deps.py to understand `--dir-for GENOME`
    # and print a directory path on stdout — verify that option exists.
    out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
        --dir-for "${genome_file}")
    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
 done
@@ -0,0 +1,33 @@
 #!/usr/bin/env bash
 # Usage: simulate_one.sh genome.fna.gz output_dir
 # Simulates paired-end HiSeq reads for a single genome.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 ISS="${SCRIPT_DIR}/../.venv/bin/iss"
 COVERAGE=15
 READ_LENGTH=150
 # CPUS may be preset in the environment; otherwise try macOS sysctl, then nproc, then 2.
 CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
 genome_file="${1:?Usage: simulate_one.sh genome.fna.gz output_dir}"
 out_dir="${2:?Usage: simulate_one.sh genome.fna.gz output_dir}"
 mkdir -p "${out_dir}"
 # BSD/macOS mktemp only substitutes X's at the very end of the template, so a
 # template ending in "XXXXXX.fna" is not randomised there and concurrent runs
 # collide on the same file. Create a private directory (template ends in X's)
 # and keep the .fna name inside it.
 tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")
 tmp_fasta="${tmp_dir}/genome.fna"
 trap 'rm -rf "${tmp_dir}"' EXIT
 gzip -dc "${genome_file}" > "${tmp_fasta}"
 # Genome size = number of non-whitespace characters on non-header lines.
 genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
 # ceil(COVERAGE * genome_size / (2 * READ_LENGTH)), passed to iss as --n_reads.
 n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
 echo "[${out_dir}]  genome=${genome_size} bp  →  ${n_reads} read pairs  (${COVERAGE}x HiSeq)"
 "${ISS}" generate \
    --genomes   "${tmp_fasta}" \
    --model     HiSeq \
    --n_reads   "${n_reads}" \
    --cpus      "${CPUS}" \
    --compress \
    --output    "${out_dir}/reads"
@@ -0,0 +1,181 @@
 #!/usr/bin/env python3
 """Compare an obikmer count index against a reference kmer set (presence + counts).
 Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
 streams `obikmer dump` from a --with-counts index, then reports:
  - false negatives : kmers in reference absent from the index
  - false positives : kmers in the index absent from the reference
  - count mismatches: kmers present in both but with differing counts
 Output to stdout: one CSV row
  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
  fn_pct,fp_pct,cm_pct
 """
 import argparse
 import subprocess
 import sys
 import numpy as np
 # ── encoding ──────────────────────────────────────────────────────────────────
 # 2-bit nucleotide codes; lower-case input is accepted, decoding is upper-case.
 _ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
           'a': 0, 'c': 1, 'g': 2, 't': 3}
 _DECODE = ['A', 'C', 'G', 'T']
 def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, 2 bits per base, first base most significant.
    Raises KeyError for any character other than A/C/G/T in either case.
    """
    packed = 0
    for base in s:
        packed = packed * 4 + _ENCODE[base]
    return packed
 def decode_kmer(val: int, k: int) -> str:
    """Unpack the k lowest 2-bit groups of val into an upper-case string (inverse of encode_kmer)."""
    return ''.join(_DECODE[(val >> (2 * pos)) & 3] for pos in range(k - 1, -1, -1))
 # ── dump parsing ──────────────────────────────────────────────────────────────
 def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
    """Run `obikmer dump` on index_dir and return (kmers_sorted_uint64, counts_uint32).
    The first output line is skipped as a header; on every following line the
    first comma-separated field is the kmer and the second its count. Both
    arrays are ordered by ascending encoded kmer. Exits the process with
    status 1 when the dump command returns a non-zero code.
    """
    proc = subprocess.Popen([obikmer_bin, 'dump', index_dir],
                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                            text=True)
    kmers, counts = [], []
    rows = iter(proc.stdout)
    next(rows, None)  # discard the header line
    for line in rows:
        fields = line.rstrip('\n').split(',')
        kmers.append(encode_kmer(fields[0]))
        counts.append(int(fields[1]))
    proc.wait()
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    kmer_arr = np.array(kmers, dtype=np.uint64)
    order = np.argsort(kmer_arr, kind='stable')
    return kmer_arr[order], np.array(counts, dtype=np.uint32)[order]
 # ── comparison ────────────────────────────────────────────────────────────────
 def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
            idx_kmers: np.ndarray, idx_counts: np.ndarray,
            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).
    Both kmer arrays must be sorted and free of duplicates. false_neg holds
    reference kmers missing from the index, false_pos index kmers missing from
    the reference; the cm_* arrays cover kmers present in both arrays but with
    differing counts. Either side may be empty.
    """
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
    # Count mismatches among shared kmers.
    # Both arrays are sorted so we can use searchsorted.
    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
    # An insertion point equal to len(idx_kmers) means "larger than every index
    # kmer", hence not shared. Masking those positions (rather than clipping
    # them to len - 1) also keeps an empty index from being indexed at -1.
    in_range = pos_in_idx < len(idx_kmers)
    shared_mask = np.zeros(len(ref_kmers), dtype=bool)
    shared_mask[in_range] = idx_kmers[pos_in_idx[in_range]] == ref_kmers[in_range]
    shared_ref_counts = ref_counts[shared_mask]
    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
    mismatch_mask     = shared_ref_counts != shared_idx_counts
    cm_kmers      = ref_kmers[shared_mask][mismatch_mask]
    cm_ref_counts = shared_ref_counts[mismatch_mask]
    cm_idx_counts = shared_idx_counts[mismatch_mask]
    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
 # ── main ─────────────────────────────────────────────────────────────────────
 def main() -> None:
    """Command-line entry point: compare one count index with its reference .npz.
    Progress and the summary go to stderr; stdout receives exactly one CSV row
    (or only the CSV header with --header). The optional --save-* files are
    written only when the corresponding list is non-empty.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?',
                    help='Reference .npz file')
    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?',
                    help='obikmer index directory (built with --with-counts)')
    ap.add_argument('--obikmer',  default='obikmer',
                    help='Path to obikmer binary')
    ap.add_argument('--species',  default='')
    ap.add_argument('--strain',   default='')
    ap.add_argument('--header',   action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fp',  metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn',  metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    ap.add_argument('--save-cm',  metavar='FILE',
                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
    args = ap.parse_args()
    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return
    # The positionals are optional only so that --header can be used alone;
    # without this check a missing one reaches subprocess/np.load as None.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')
    # Detect k from the first data row of the dump (line 0 is the header).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer rows in dump of {args.index}', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])
    # Load reference
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    npz = np.load(args.reference)
    ref_kmers  = npz['kmers']    # sorted uint64
    ref_counts = npz['counts']   # uint32
    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers, idx_counts = load_index(args.obikmer, args.index)
    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
        ref_kmers, ref_counts, idx_kmers, idx_counts)
    # Percentages: fn relative to the reference, fp relative to the index,
    # count mismatches relative to the kmers present in both.
    n_shared  = len(ref_kmers) - len(false_neg)
    fn_pct    = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct    = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct    = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
    print(f'false negatives : {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives : {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
    print(f'count mismatches: {len(cm_kmers):,}  ({cm_pct:.4f}% of shared)',
          file=sys.stderr)
    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')
    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')
    if args.save_cm and len(cm_kmers):
        with open(args.save_cm, 'w') as fh:
            fh.write('kmer,ref_count,idx_count\n')
            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')
    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
 if __name__ == '__main__':
    main()
@@ -0,0 +1,201 @@
 #!/usr/bin/env python3
 """Verify the merged count index against all per-specimen reference sets.
 Streams `obikmer dump` once on the merged index, accumulates per-specimen
 kmer+count pairs from each column, then compares each against its reference .npz.
 Output to stdout: one CSV row per specimen (same columns as verify_count.py)
  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
  fn_pct,fp_pct,cm_pct
 """
 import argparse
 import subprocess
 import sys
 from pathlib import Path
 import numpy as np
 # ── encoding ──────────────────────────────────────────────────────────────────
 # 2-bit nucleotide codes; lower-case input is accepted, decoding is upper-case.
 _ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
           'a': 0, 'c': 1, 'g': 2, 't': 3}
 _DECODE = ['A', 'C', 'G', 'T']
 def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, 2 bits per base, first base most significant.
    Raises KeyError for any character other than A/C/G/T in either case.
    """
    packed = 0
    for base in s:
        packed = packed * 4 + _ENCODE[base]
    return packed
 def decode_kmer(val: int, k: int) -> str:
    """Unpack the k lowest 2-bit groups of val into an upper-case string (inverse of encode_kmer)."""
    return ''.join(_DECODE[(val >> (2 * pos)) & 3] for pos in range(k - 1, -1, -1))
 # ── single-pass dump ──────────────────────────────────────────────────────────
 def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
    """Read `obikmer dump` of the merged index in a single pass.
    The header line names the columns: the first is the kmer, each following
    one a specimen. For every data row the encoded kmer and its count are
    appended to each specimen whose cell is greater than zero.
    Returns:
        specimen_names : column labels in dump order
        per_specimen   : mapping label → (kmer_ints, counts) for entries > 0
    Exits the process with status 1 when the dump command fails.
    """
    proc = subprocess.Popen([obikmer_bin, 'dump', index_dir],
                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                            text=True)
    header_line = proc.stdout.readline().rstrip('\n')
    specimen_names = header_line.split(',')[1:]
    per_specimen: dict[str, tuple[list[int], list[int]]] = {
        name: ([], []) for name in specimen_names}
    for line in proc.stdout:
        kmer_text, *cells = line.rstrip('\n').split(',')
        kmer_int = encode_kmer(kmer_text)
        for col, name in enumerate(specimen_names):
            count = int(cells[col])
            if count <= 0:
                continue
            kmer_col, count_col = per_specimen[name]
            kmer_col.append(kmer_int)
            count_col.append(count)
    proc.wait()
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    return specimen_names, per_specimen
 # ── per-specimen comparison ───────────────────────────────────────────────────
 def compare_specimen(name: str,
                     kmer_list: list[int],
                     count_list: list[int],
                     ref_dir: Path,
                     k: int,
                     save_fn: Path | None,
                     save_fp: Path | None,
                     save_cm: Path | None,
                     ) -> str:
    ref_path = ref_dir / f'{name}.npz'
    if not ref_path.exists():
        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
        return ''
    species = name.split('--')[0]
    strain  = name[len(species) + 2:]
    npz        = np.load(ref_path)
    ref_kmers  = npz['kmers']    # sorted uint64
    ref_counts = npz['counts']   # uint32
    order      = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
    idx_kmers  = np.array(kmer_list,  dtype=np.uint64)[order]
    idx_counts = np.array(count_list, dtype=np.uint32)[order]
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
    # Count mismatches among shared kmers
    pos_in_idx     = np.searchsorted(idx_kmers, ref_kmers)
    pos_in_idx     = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
    shared_mask    = idx_kmers[pos_in_idx] == ref_kmers
    mismatch_mask  = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
    cm_kmers       = ref_kmers[shared_mask][mismatch_mask]
    cm_ref         = ref_counts[shared_mask][mismatch_mask]
    cm_idx         = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
    n_shared = int(shared_mask.sum())
    fn_pct   = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct   = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct   = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
    print(f'  {name}: ref={len(ref_kmers):,}  idx={len(idx_kmers):,}  '
          f'fn={len(false_neg):,} ({fn_pct:.4f}%)  '
          f'fp={len(false_pos):,} ({fp_pct:.4f}%)  '
          f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
          file=sys.stderr)
    if save_fn and len(false_neg):
        fn_file = save_fn / f'{name}_fn.txt'
        fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
    if save_fp and len(false_pos):
        fp_file = save_fp / f'{name}_fp.txt'
        fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
    if save_cm and len(cm_kmers):
        cm_file = save_cm / f'{name}_cm.csv'
        lines = ['kmer,ref_count,idx_count']
        for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
            lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
        cm_file.write_text('\n'.join(lines) + '\n')
    return (f'{species},{strain},'
            f'{len(ref_kmers)},{len(idx_kmers)},'
            f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
            f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
 # ── main ─────────────────────────────────────────────────────────────────────
 def main() -> None:
    """Command-line entry point: verify every specimen column of the merged count index.
    Progress goes to stderr; stdout receives one CSV row per specimen that has
    a reference file in REF_DIR (or only the CSV header with --header).
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
                    help='Merged count index directory')
    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header',  action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory for false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory for false-positive kmer lists')
    ap.add_argument('--save-cm', metavar='DIR',
                    help='Directory for count-mismatch CSV files')
    args = ap.parse_args()
    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return
    # The positionals are optional only so that --header can be used alone;
    # without this check a missing one reaches Path()/subprocess as None.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')
    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    save_cm = Path(args.save_cm) if args.save_cm else None
    for d in (save_fn, save_fp, save_cm):
        if d: d.mkdir(parents=True, exist_ok=True)
    # Detect k from the first data row of the dump (line 0 is the header).
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer rows in dump of {args.index}', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])
    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
    for name in specimen_names:
        kmers, counts = per_specimen[name]
        row = compare_specimen(name, kmers, counts, ref_dir, k,
                               save_fn, save_fp, save_cm)
        # compare_specimen returns '' for specimens without a reference file.
        if row:
            print(row)
 if __name__ == '__main__':
    main()
@@ -0,0 +1,27 @@
 #!/usr/bin/env bash
 # Run verify_merge_count.py on the merged count index and archive the result.
 # Output: stats/verify_merge_count/current.csv (CSV header followed by the rows
 # printed by the verifier), then copied to the first unused
 # stats/verify_merge_count/count_NNN.csv.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 INDEX="${SCRIPT_DIR}/global_index_count"
 REF_DIR="${SCRIPT_DIR}/reference_index"
 STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
 PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
 VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"
 mkdir -p "${STATS_DIR}"
 CURRENT="${STATS_DIR}/current.csv"
 "${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
 "${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    "${INDEX}" "${REF_DIR}" \
    >>"${CURRENT}"
 # Start numbering at the count of existing archives, then step past any name
 # that is already taken: if an earlier archive was deleted, the bare count
 # would point at an existing file and cp would silently overwrite it.
 run_n=$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')
 ARCHIVE="${STATS_DIR}/count_$(printf '%03d' "${run_n}").csv"
 while [[ -e "${ARCHIVE}" ]]; do
    run_n=$((run_n + 1))
    ARCHIVE="${STATS_DIR}/count_$(printf '%03d' "${run_n}").csv"
 done
 cp "${CURRENT}" "${ARCHIVE}"
 echo "Done. Results → ${ARCHIVE}"
@@ -0,0 +1,170 @@
 #!/usr/bin/env python3
 """Verify the merged presence index against all per-specimen reference sets.
 Streams `obikmer dump` once on the merged index, accumulates per-specimen
 kmer sets from each column, then compares each against its reference .npz.
 Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
 """
 import argparse
 import subprocess
 import sys
 from pathlib import Path
 import numpy as np
 # ── encoding ──────────────────────────────────────────────────────────────────
 # 2-bit nucleotide codes: the position in _DECODE is the code of the base.
 _DECODE = list('ACGT')
 # Upper- and lower-case letters map to the same code.
 _ENCODE = {base: code for code, base in enumerate(_DECODE)}
 _ENCODE.update((base.lower(), code) for code, base in enumerate(_DECODE))
 def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, 2 bits per base.
    The first base ends up in the most significant position. Characters
    outside A/C/G/T (either case) raise KeyError; '' encodes to 0.
    """
    packed = 0
    for base in s:
        # Codes are < 4, so multiply-and-add equals shift-and-or.
        packed = packed * 4 + _ENCODE[base]
    return packed
 def decode_kmer(val: int, k: int) -> str:
    """Unpack the k lowest 2-bit groups of `val` into a nucleotide string.
    The lowest group becomes the last base; bits above 2*k are ignored.
    """
    letters = [''] * k
    # Fill from the right: the least significant pair is the last base.
    for pos in range(k - 1, -1, -1):
        letters[pos] = _DECODE[val & 3]
        val = val >> 2
    return ''.join(letters)
 # ── single-pass dump ──────────────────────────────────────────────────────────
 def stream_merged_dump(obikmer_bin: str, index_dir: str,
                        ) -> tuple[list[str], dict[str, list[int]]]:
    """Run `obikmer dump` on the merged index and split it per column.
    The first output line is the CSV header; every following line is one
    kmer followed by one integer per specimen column.
    Returns:
        specimen_names : header labels in dump order, without the first column
        per_specimen   : label → encoded kmers whose value in that column is > 0
    Exits the process with status 1 if the dump command fails.
    """
    proc = subprocess.Popen([obikmer_bin, 'dump', index_dir],
                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                            text=True)
    # Header: drop the leading kmer column, keep the specimen labels.
    specimen_names = proc.stdout.readline().rstrip('\n').split(',')[1:]
    per_specimen: dict[str, list[int]] = {}
    for label in specimen_names:
        per_specimen[label] = []
    for row in proc.stdout:
        fields = row.rstrip('\n').split(',')
        code = encode_kmer(fields[0])
        # Column 0 is the kmer, so specimen columns start at 1.
        for col, label in enumerate(specimen_names, start=1):
            if int(fields[col]) > 0:
                per_specimen[label].append(code)
    if proc.wait() != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    return specimen_names, per_specimen
 # ── per-specimen comparison ───────────────────────────────────────────────────
 def compare_specimen(name: str,
                     kmer_list: list[int],
                     ref_dir: Path,
                     k: int,
                     save_fn: Path | None,
                     save_fp: Path | None,
                     ) -> str:
    """Compare one specimen column against its reference .npz.
    Returns a CSV row string.
    """
    ref_path = ref_dir / f'{name}.npz'
    if not ref_path.exists():
        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
        return ''
    species = name.split('--')[0]
    strain  = name[len(species) + 2:]
    ref_kmers = np.load(ref_path)['kmers']          # sorted uint64
    idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    print(f'  {name}: ref={len(ref_kmers):,}  idx={len(idx_kmers):,}  '
          f'fn={len(false_neg):,} ({fn_pct:.4f}%)  '
          f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
          file=sys.stderr)
    if save_fn and len(false_neg):
        fn_file = save_fn / f'{name}_fn.txt'
        fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
    if save_fp and len(false_pos):
        fp_file = save_fp / f'{name}_fp.txt'
        fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
    return (f'{species},{strain},'
            f'{len(ref_kmers)},{len(idx_kmers)},'
            f'{len(false_neg)},{len(false_pos)},'
            f'{fn_pct:.4f},{fp_pct:.4f}')
 # ── main ─────────────────────────────────────────────────────────────────────
 def main() -> None:
    """Command-line entry point for the merged presence verification.
    With --header, prints the CSV header and returns. Otherwise detects k
    from the first dumped row, streams the merged dump once, and prints one
    CSV row per specimen column that has a reference file. Progress goes to
    stderr.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
                    help='Merged presence index directory')
    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header',  action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory to save false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory to save false-positive kmer lists')
    args = ap.parse_args()
    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return
    # The positionals are optional only so that --header can be used alone;
    # a real run needs both, so report a usage error instead of a TypeError.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')
    ref_dir  = Path(args.ref_dir)
    save_fn  = Path(args.save_fn) if args.save_fn else None
    save_fp  = Path(args.save_fp) if args.save_fp else None
    for out_dir in (save_fn, save_fp):
        if out_dir:
            out_dir.mkdir(parents=True, exist_ok=True)
    # Detect k from the first data row (line 0 of the dump is the header).
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer rows in dump of {args.index}; cannot detect k',
              file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])
    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
    for name in specimen_names:
        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
        # compare_specimen returns '' for specimens without a reference.
        if row:
            print(row)
 if __name__ == '__main__':
    main()
@@ -0,0 +1,27 @@
 #!/usr/bin/env bash
 # Run verify_merge_presence.py on the merged presence index and archive the
 # result.
 # Output: stats/verify_merge_presence/current.csv (CSV header followed by the
 # rows printed by the verifier), then copied to the first unused
 # stats/verify_merge_presence/presence_NNN.csv.
 set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 INDEX="${SCRIPT_DIR}/global_index_presence"
 REF_DIR="${SCRIPT_DIR}/reference_index"
 STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
 PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
 VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"
 mkdir -p "${STATS_DIR}"
 CURRENT="${STATS_DIR}/current.csv"
 "${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
 "${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    "${INDEX}" "${REF_DIR}" \
    >>"${CURRENT}"
 # Start numbering at the count of existing archives, then step past any name
 # that is already taken: if an earlier archive was deleted, the bare count
 # would point at an existing file and cp would silently overwrite it.
 run_n=$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')
 ARCHIVE="${STATS_DIR}/presence_$(printf '%03d' "${run_n}").csv"
 while [[ -e "${ARCHIVE}" ]]; do
    run_n=$((run_n + 1))
    ARCHIVE="${STATS_DIR}/presence_$(printf '%03d' "${run_n}").csv"
 done
 cp "${CURRENT}" "${ARCHIVE}"
 echo "Done. Results → ${ARCHIVE}"
@@ -0,0 +1,30 @@
 #!/usr/bin/env bash
 # Usage: verify_one_count.sh SPECIMEN
 # SPECIMEN = "species--strain" (Make pattern stem)
 # Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header)
 #
 # The row is first written to a temporary file and renamed into place only
 # after the verifier succeeds, so a failed run never leaves a truncated
 # .stats file behind that would look like a finished result.
 set -euo pipefail
 # Fail with a usage message rather than a bare "unbound variable" error.
 SPECIMEN="${1:?Usage: verify_one_count.sh SPECIMEN (species--strain)}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
 VERIFY_PY="${SCRIPT_DIR}/verify_count.py"
 # Split the stem at the first "--".
 species="${SPECIMEN%%--*}"
 strain="${SPECIMEN#*--}"
 REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
 INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
 STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
 STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
 STATS_TMP="${STATS_FILE}.tmp.$$"
 mkdir -p "${STATS_DIR}"
 # Remove the temporary file on any exit; after a successful mv it is gone
 # already and rm -f is a no-op.
 trap 'rm -f "${STATS_TMP}"' EXIT
 echo "[${SPECIMEN}] verifying count"
 "${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain  "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${STATS_TMP}"
 mv -f "${STATS_TMP}" "${STATS_FILE}"
@@ -0,0 +1,30 @@
 #!/usr/bin/env bash
 # Usage: verify_one_presence.sh SPECIMEN
 # SPECIMEN = "species--strain" (Make pattern stem)
 # Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header)
 #
 # The row is first written to a temporary file and renamed into place only
 # after the verifier succeeds, so a failed run never leaves a truncated
 # .stats file behind that would look like a finished result.
 set -euo pipefail
 # Fail with a usage message rather than a bare "unbound variable" error.
 SPECIMEN="${1:?Usage: verify_one_presence.sh SPECIMEN (species--strain)}"
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
 PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
 VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"
 # Split the stem at the first "--".
 species="${SPECIMEN%%--*}"
 strain="${SPECIMEN#*--}"
 REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
 INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
 STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
 STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
 STATS_TMP="${STATS_FILE}.tmp.$$"
 mkdir -p "${STATS_DIR}"
 # Remove the temporary file on any exit; after a successful mv it is gone
 # already and rm -f is a no-op.
 trap 'rm -f "${STATS_TMP}"' EXIT
 echo "[${SPECIMEN}] verifying presence"
 "${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain  "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${STATS_TMP}"
 mv -f "${STATS_TMP}" "${STATS_FILE}"
@@ -0,0 +1,139 @@
 #!/usr/bin/env python3
 """Compare an obikmer index against a reference kmer set (presence/absence).
 Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
 streams the output of `obikmer dump`, encodes each kmer string to uint64,
 then reports false negatives and false positives using numpy set operations.
 Output to stdout: one CSV row
  species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
 """
 import argparse
 import subprocess
 import sys
 import numpy as np
 # ── encoding ──────────────────────────────────────────────────────────────────
 # Base → 2-bit code; the lower-case letters reuse the upper-case codes.
 _ENCODE = dict(zip('ACGTacgt', (0, 1, 2, 3) * 2))
 # 2-bit code → base (index is the code).
 _DECODE = list('ACGT')
 def encode_kmer(s: str) -> int:
    """Encode a nucleotide string as an integer using 2 bits per base.
    The leftmost base is the most significant. Any character other than
    A/C/G/T (either case) raises KeyError; the empty string gives 0.
    """
    value = 0
    for nucleotide in s:
        value <<= 2
        value |= _ENCODE[nucleotide]
    return value
 def decode_kmer(val: int, k: int) -> str:
    """Decode the k least significant 2-bit groups of `val` into bases.
    Groups are read from the low end and emitted in reverse, so the lowest
    group is the last base of the returned string.
    """
    low_first = []
    remaining = val
    for _ in range(k):
        low_first.append(_DECODE[remaining & 3])
        remaining >>= 2
    return ''.join(low_first[::-1])
 # ── dump parsing ──────────────────────────────────────────────────────────────
 def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
    """Stream `obikmer dump` and return a sorted uint64 array of kmer integers.
    The first output line is treated as the CSV header and skipped; for every
    other line the text before the first comma is encoded with encode_kmer.
    Exits the process with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers = []
    # The context manager closes the pipe and waits for the child on exit.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        next(proc.stdout, None)  # skip the header line, if any
        for line in proc.stdout:
            # Drop the line ending first: on a row without a comma the newline
            # would otherwise stay attached to the kmer and break encoding.
            kmer_str = line.rstrip('\n').split(',', 1)[0]
            if not kmer_str:
                continue  # blank line carries no kmer
            kmers.append(encode_kmer(kmer_str))
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    arr = np.array(kmers, dtype=np.uint64)
    arr.sort()
    return arr
 # ── comparison ────────────────────────────────────────────────────────────────
 def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return (false_negatives, false_positives).
    False negatives are the values of `ref` absent from `idx`; false positives
    are the values of `idx` absent from `ref`. Both inputs are treated as
    duplicate-free (assume_unique=True).
    """
    missing, extra = (np.setdiff1d(have, other, assume_unique=True)
                      for have, other in ((ref, idx), (idx, ref)))
    return missing, extra
 # ── main ─────────────────────────────────────────────────────────────────────
 def main() -> None:
    """Command-line entry point: compare one index against one reference set.
    With --header, prints the CSV header and returns. Otherwise detects k from
    the first dumped row, loads the reference kmers, streams the index dump,
    optionally saves the false-negative / false-positive kmer strings, and
    prints a single CSV row to stdout. Progress and summaries go to stderr.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?', help='Reference .npz file')
    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
    ap.add_argument('--obikmer',  default='obikmer',   help='Path to obikmer binary')
    ap.add_argument('--species',  default='',          help='Species label for CSV row')
    ap.add_argument('--strain',   default='',          help='Strain label for CSV row')
    ap.add_argument('--header',   action='store_true', help='Print CSV header and exit')
    ap.add_argument('--save-fp',  metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn',  metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    args = ap.parse_args()
    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return
    # The positionals are optional only so that --header can be used alone;
    # a real run needs both, so report a usage error instead of a TypeError.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')
    # Detect k from the index (one cheap call before the full dump).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    # Line 0 is the header; without a data row there is no kmer to measure.
    if len(dump_lines) < 2:
        print(f'ERROR: no kmer rows in dump of {args.index}; cannot detect k',
              file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])
    # Load reference; close the NpzFile as soon as the array has been read.
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    with np.load(args.reference) as npz:
        ref_kmers = npz['kmers']          # already sorted uint64
    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers = load_index_kmers(args.obikmer, args.index)
    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
    false_neg, false_pos = compare(ref_kmers, idx_kmers)
    # Guard the ratios against empty sets.
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    print(f'false negatives: {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives: {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)
    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False positives saved → {args.save_fp}', file=sys.stderr)
    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},'
          f'{fn_pct:.4f},{fp_pct:.4f}')
 if __name__ == '__main__':
    main()
@@ -29,16 +29,17 @@ Multiple values separated by `|` are always OR-ed within the predicate.
 ### Path matching (`~` and `!~`)
-Metadata values can represent hierarchical taxonomic paths such as
+Metadata values can represent hierarchical concept paths such as
 `/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
- **Absolute pattern** (starts with `/`): the value must start with the pattern
+**Both the stored metadata value and the pattern must start with `/`.**
-  at a segment boundary.
+A pattern that does not start with `/` is rejected at parse time with an error.
-  `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
+
 The value matches the pattern if it equals it exactly or starts with the pattern
 followed by `/` (segment-boundary prefix):
 - `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
  `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
 - **Bare segment** (no leading `/`): the value must contain the pattern as an
  exact path component anywhere.
  `taxon~Betula` matches any path that has `Betula` as one of its segments.
 ### Missing metadata key → NA
@@ -53,6 +53,7 @@ nav:
      - Merge parallelism & memory: implementation/merge_parallelism.md
      - Kmer filtering: implementation/filtering.md
      - Select command: implementation/select.md
      - obitaxonomy crate: implementation/obitaxonomy.md
  - Architecture:
      - Sequences: architecture/sequences/invariant.md
      - Kmer index: architecture/index_architecture.md
@@ -1853,6 +1853,10 @@ dependencies = [
 "tracing",
 ]
 [[package]]
 name = "obitaxonomy"
 version = "0.1.0"
 [[package]]
 name = "object"
 version = "0.37.3"
@@ -1,5 +1,5 @@
 [workspace]
 resolver = "3"
-members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
+members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
 [profile.release]
 debug = 1
@@ -88,9 +88,9 @@ impl<'a> IntoIterator for &'a PersistentBitVec {
 // ── BitIter ───────────────────────────────────────────────────────────────────
 pub struct BitIter<'a> {
-    pub(crate) words: &'a [u64],
+    words: &'a [u64],
-    pub(crate) slot:  usize,
+    slot:  usize,
-    pub(crate) n:     usize,
+    n:     usize,
 }
 impl ExactSizeIterator for BitIter<'_> {}
@@ -132,7 +132,7 @@ impl PersistentBitVecBuilder {
        Ok(Self { mmap, n, path: path.to_path_buf() })
    }
-    pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
+    pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
        let file_size = HEADER_SIZE + n_bytes_for_words(n);
        let file = OpenOptions::new()
            .read(true).write(true).create(true).truncate(true)
@@ -18,11 +18,11 @@ pub use builder::PersistentCompactIntVecBuilder;
 pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
-pub use reader::PersistentCompactIntVec;
+pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
-pub use tempbitvec::TempBitVec;
+pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
-pub use tempintvec::TempCompactIntVec;
+pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
 pub use traits::{BitPartials, ColumnWeights, CountPartials};
-pub use views::{BitSliceView, IntSliceView};
+pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};
 #[cfg(test)]
 #[path = "tests/mod.rs"]
@@ -43,27 +43,27 @@ impl TempBitVec {
 // ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
-pub(crate) struct TempBitVecBuilder {
+pub struct TempBitVecBuilder {
    builder: PersistentBitVecBuilder,
    temp: TempDir,
 }
 impl TempBitVecBuilder {
-    pub(crate) fn new(n: usize) -> io::Result<Self> {
+    pub fn new(n: usize) -> io::Result<Self> {
        let temp = TempDir::new()?;
        let path = temp.path().join("data.pbiv");
        let builder = PersistentBitVecBuilder::new(n, &path)?;
        Ok(Self { builder, temp })
    }
-    pub(crate) fn new_ones(n: usize) -> io::Result<Self> {
+    pub fn new_ones(n: usize) -> io::Result<Self> {
        let temp = TempDir::new()?;
        let path = temp.path().join("data.pbiv");
        let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
        Ok(Self { builder, temp })
    }
-    pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
+    pub fn freeze(self) -> io::Result<TempBitVec> {
        let Self { builder, temp } = self;
        let vec = builder.finish()?;
        Ok(TempBitVec { vec, _temp: temp })
@@ -72,7 +72,8 @@ impl TempBitVecBuilder {
    pub fn set(&mut self, slot: usize, value: bool) {
        self.builder.set(slot, value);
    }
-    pub(crate) fn view(&self) -> BitSliceView<'_> {
+
    pub fn view(&self) -> BitSliceView<'_> {
        self.builder.view()
    }
@@ -80,19 +81,19 @@ impl TempBitVecBuilder {
        self.builder.or(other);
    }
-    pub(crate) fn and(&mut self, other: BitSliceView<'_>) {
+    pub fn and(&mut self, other: BitSliceView<'_>) {
        self.builder.and(other);
    }
-    pub(crate) fn xor(&mut self, other: BitSliceView<'_>) {
+    pub fn xor(&mut self, other: BitSliceView<'_>) {
        self.builder.xor(other);
    }
-    pub(crate) fn not(&mut self) {
+    pub fn not(&mut self) {
        self.builder.not();
    }
-    pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) {
+    pub fn copy_from(&mut self, src: BitSliceView<'_>) {
        self.builder.copy_from(src);
    }
@@ -100,11 +101,11 @@ impl TempBitVecBuilder {
        self.builder.or_where(col, pred);
    }
-    pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.and_where(col, pred);
    }
-    pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.xor_where(col, pred);
    }
 }
@@ -32,60 +32,58 @@ impl TempCompactIntVec {
 // ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
-pub(crate) struct TempCompactIntVecBuilder {
+pub struct TempCompactIntVecBuilder {
    builder: PersistentCompactIntVecBuilder,
    temp:    TempDir,
 }
 impl TempCompactIntVecBuilder {
-    pub(crate) fn new(n: usize) -> io::Result<Self> {
+    pub fn new(n: usize) -> io::Result<Self> {
        let temp = TempDir::new()?;
        let path = temp.path().join("data.pciv");
        let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
        Ok(Self { builder, temp })
    }
-    pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
+    pub fn freeze(self) -> io::Result<TempCompactIntVec> {
        let Self { builder, temp } = self;
        let vec = builder.finish()?;
        Ok(TempCompactIntVec { vec, _temp: temp })
    }
-    // ── Delegation methods ────────────────────────────────────────────────────
+    pub fn n(&self) -> usize { self.builder.len() }
-    pub(crate) fn n(&self) -> usize { self.builder.len() }
+    pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
    pub fn get(&self, slot: usize) -> u32           { self.builder.get(slot) }
-    pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
+    pub fn primary_bytes(&self)         -> &[u8]      { self.builder.primary_bytes() }
-    pub(crate) fn get(&self, slot: usize) -> u32           { self.builder.get(slot) }
+    pub fn primary_bytes_mut(&mut self) -> &mut [u8]  { self.builder.primary_bytes_mut() }
-    pub(crate) fn primary_bytes(&self)       -> &[u8]      { self.builder.primary_bytes() }
+    pub fn inc_present(&mut self, col: BitSliceView<'_>) {
    pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
    pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) {
        self.builder.inc_present(col);
    }
-    pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
+    pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
        self.builder.inc_present_fast(col);
    }
-    pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.inc_predicate(col, pred);
    }
-    pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+    pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
        self.builder.inc_predicate_fast(col, pred);
    }
-    pub(crate) fn add(&mut self, other: IntSliceView<'_>) {
+    pub fn add(&mut self, other: IntSliceView<'_>) {
        self.builder.add(other);
    }
-    pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) {
+    pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
        self.builder.mask_with(mask);
    }
-    pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
+    pub fn min(&mut self, other: IntSliceView<'_>)  { self.builder.min(other); }
-    pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
+    pub fn max(&mut self, other: IntSliceView<'_>)  { self.builder.max(other); }
-    pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
+    pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
 }
@@ -3,6 +3,7 @@ use crossbeam_channel;
 use hashbrown::HashMap;
 use obikseq::k;
 use obikseq::{CanonicalKmer, Sequence, Unitig};
 #[cfg(not(any(test, feature = "test-utils")))]
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
 use std::cell::RefCell;
 use std::fmt;
@@ -11,7 +11,7 @@ use obilayeredmap::IndexMode;
 use crate::error::{OKIError, OKIResult};
 use crate::index::KmerIndex;
 use crate::meta::{GenomeInfo, IndexMeta};
-use crate::state::IndexState;
+use crate::state::{IndexState, SENTINEL_INDEXED};
 pub use obikpartitionner::MergeMode;
@@ -263,6 +263,8 @@ impl KmerIndex {
            rep.push(t.stop());
        }
        fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
        KmerIndex::open(output)
    }
 }
@@ -49,6 +49,11 @@ impl MetaPred {
        if values.iter().any(|v| v.is_empty()) {
            return Err(format!("empty value in predicate: {s}"));
        }
        if matches!(op, PredOp::Matches | PredOp::NotMatches) {
            if let Some(v) = values.iter().find(|v| !v.starts_with('/')) {
                return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}"));
            }
        }
        Ok(Self { key, op, values })
    }
@@ -72,16 +77,12 @@ impl MetaPred {
 /// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
 ///
-/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
+/// Both `value` and `pattern` must start with `/`.
-/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
+/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`.
 fn path_matches(value: &str, pattern: &str) -> bool {
-    if pattern.starts_with('/') {
+    value == pattern
-        value == pattern
+        || (value.starts_with(pattern)
-            || (value.starts_with(pattern)
+            && value[pattern.len()..].starts_with('/'))
                && value[pattern.len()..].starts_with('/'))
    } else {
        value.split('/').any(|seg| seg == pattern)
    }
 }
 // ── Three-value group evaluation ──────────────────────────────────────────────
@@ -1,28 +0,0 @@
 >F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"}
 GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA
 >C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"}
 AAAAAAATTAAGTCCCATATTGAAACATTAT
 >9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"}
 AAAAAATTAAGTCCCATATTGAAACATTATC
 >74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"}
 AAAAATTAAGTCCCATATTGAAACATTATCA
 >45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"}
 AAAATTAAGTCCCATATTGAAACATTATCAC
 >5F44864B90170AF4 {"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"}
 AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT
 >8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"}
 AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG
 >C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"}
 AATGTGAGTTGTTAATATTACATAATTGGGTT
 >933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"}
 TGTGAGTTGTTAATATTACATAATTGGGTTT
 >3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"}
 GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT
 >1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"}
 TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC
 >8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"}
 AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT
 >247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"}
 CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT
 >11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"}
 GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG