chore: bump obikmer to 1.1.13 and fix Makefile revision tag

Update the obikmer crate version from 1.1.12 to 1.1.13 in Cargo.toml. Additionally, change the Makefile's Git revision specifier from @- to @ to ensure the version tag is applied to the current commit before pushing.
Merge pull request 'chore: bump version to 1.1.12 and fix release workflow' (#40 ) from push-zmkxouxypspm into main
2026-06-22 16:17:52 +02:00 · 2026-06-22 14:13:13 +00:00 · 2026-06-22 16:12:56 +02:00 · 2026-06-22 13:52:51 +00:00 · 2026-06-22 15:52:34 +02:00 · 2026-06-22 13:32:50 +00:00
120 changed files with 10274 additions and 1736 deletions
@@ -0,0 +1,36 @@
+name: CI
+
+on:
+  push:
+    branches: ['**']
+  pull_request:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: src
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
+          echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            src/target
+          key: ${{ runner.os }}-cargo-${{ hashFiles('src/Cargo.lock') }}
+          restore-keys: ${{ runner.os }}-cargo-
+
+      - name: Build
+        run: cargo build --release
+
+      - name: Test
+        run: cargo test --release
@@ -0,0 +1,65 @@
+name: Release
+
+on:
+  push:
+    tags:
+      - "v*"
+
+jobs:
+  build-linux-static:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: src
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust + zigbuild
+        run: |
+          curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
+          echo "$HOME/.cargo/bin" >> $GITHUB_PATH
+          sudo apt-get update -qq && sudo apt-get install -y -qq jq
+          pip install ziglang --quiet --break-system-packages
+          $HOME/.cargo/bin/cargo install cargo-zigbuild
+          $HOME/.cargo/bin/rustup target add x86_64-unknown-linux-musl
+
+      - name: Create musl C/C++ wrappers
+        run: |
+          ZIG=$(python3 -c "import ziglang, os; print(os.path.join(os.path.dirname(ziglang.__file__), 'zig'))")
+          printf '#!/bin/sh\nexec "%s" cc -target x86_64-linux-musl "$@"\n' "$ZIG" | sudo tee /usr/local/bin/x86_64-linux-musl-gcc > /dev/null
+          printf '#!/bin/sh\nexec "%s" c++ -target x86_64-linux-musl "$@"\n' "$ZIG" | sudo tee /usr/local/bin/x86_64-linux-musl-g++ > /dev/null
+          sudo chmod +x /usr/local/bin/x86_64-linux-musl-gcc /usr/local/bin/x86_64-linux-musl-g++
+
+      - name: Cache cargo registry
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            ~/.cargo/git
+            src/target
+          key: linux-musl-cargo-${{ hashFiles('src/Cargo.lock') }}
+          restore-keys: linux-musl-cargo-
+
+      - name: Build static binary
+        run: cargo zigbuild --release --target x86_64-unknown-linux-musl
+
+      - name: Prepare artifact
+        run: |
+          mkdir -p /tmp/dist
+          cp target/x86_64-unknown-linux-musl/release/obikmer /tmp/dist/obikmer-linux-x86_64
+          strip /tmp/dist/obikmer-linux-x86_64
+
+      # Publish the pushed tag as a Gitea release and attach the static binary.
+      # The script fails fast: curl uses --fail and the shell uses pipefail, so
+      # an API error stops the job instead of silently yielding release_id=null
+      # and uploading to ".../releases/null/assets".
+      - name: Create Gitea release and upload binary
+        env:
+          GITEA_TOKEN: ${{ secrets.GITEATOKEN }}
+          TAG: ${{ github.ref_name }}
+        run: |
+          set -euo pipefail
+          release_id=$(curl -fsS -X POST \
+            "${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases" \
+            -H "Authorization: token $GITEA_TOKEN" \
+            -H "Content-Type: application/json" \
+            -d "{\"tag_name\":\"$TAG\",\"name\":\"$TAG\"}" | jq -r '.id')
+          if [ -z "$release_id" ] || [ "$release_id" = "null" ]; then
+            echo "ERROR: release creation for $TAG returned no id" >&2
+            exit 1
+          fi
+          curl -fsS -X POST \
+            "${{ github.server_url }}/api/v1/repos/${{ github.repository }}/releases/$release_id/assets" \
+            -H "Authorization: token $GITEA_TOKEN" \
+            -F "attachment=@/tmp/dist/obikmer-linux-x86_64"
@@ -9,3 +9,13 @@ data-stress
 ./**/*.json
 *.bin
 Betula_exilis--IGA-24-33
+benchmark/genomes
+benchmark/simulated_data
+benchmark/specimen_index_presence
+benchmark/specimen_index_count
+benchmark/global_index_presence
+benchmark/global_index_count
+benchmark/stats
+benchmark/reference_index
+benchmark/specific_index_count
+benchmark/specific_index_presence
@@ -0,0 +1,2 @@
+/cache
+/project.local.yml
@@ -0,0 +1,133 @@
+# the name by which the project can be referenced within Serena
+project_name: "obikmer"
+
+
+# list of languages for which language servers are started; choose from:
+#   al                  angular             ansible             bash                clojure
+#   cpp                 cpp_ccls            crystal             csharp              csharp_omnisharp
+#   dart                elixir              elm                 erlang              fortran
+#   fsharp              go                  groovy              haskell             haxe
+#   hlsl                html                java                json                julia
+#   kotlin              lean4               lua                 luau                markdown
+#   matlab              msl                 nix                 ocaml               pascal
+#   perl                php                 php_phpactor        powershell          python
+#   python_jedi         python_ty           r                   rego                ruby
+#   ruby_solargraph     rust                scala               scss                solidity
+#   svelte              swift               systemverilog       terraform           toml
+#   typescript          typescript_vts      vue                 yaml                zig
+#   (This list may be outdated. For the current list, see values of Language enum here:
+#   https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
+#   For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
+# Note:
+#   - For C, use cpp
+#   - For JavaScript, use typescript
+#   - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
+#   - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
+#   - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
+#   - For Free Pascal/Lazarus, use pascal
+# Special requirements:
+#   Some languages require additional setup/installations.
+#   See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
+# When using multiple languages, the first language server that supports a given file will be used for that file.
+# The first language is the default language and the respective language server will be used as a fallback.
+# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
+languages:
+- rust
+
+# the encoding used by text files in the project
+# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
+encoding: "utf-8"
+
+# line ending convention to use when writing source files.
+# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
+# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
+line_ending:
+
+# The language backend to use for this project.
+# If not set, the global setting from serena_config.yml is used.
+# Valid values: LSP, JetBrains
+# Note: the backend is fixed at startup. If a project with a different backend
+# is activated post-init, an error will be returned.
+language_backend:
+
+# whether to use project's .gitignore files to ignore files
+ignore_all_files_in_gitignore: true
+
+# advanced configuration option allowing to configure language server-specific options.
+# Maps the language key to the options.
+# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
+# No documentation on options means no options are available.
+ls_specific_settings: {}
+
+# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
+# Paths can be absolute or relative to the project root.
+# Each folder is registered as an LSP workspace folder, enabling language servers to discover
+# symbols and references across package boundaries.
+# Currently supported for: TypeScript.
+# Example:
+#   additional_workspace_folders:
+#     - ../sibling-package
+#     - ../shared-lib
+additional_workspace_folders: []
+
+# list of additional paths to ignore in this project.
+# Same syntax as gitignore, so you can use * and **.
+# Note: global ignored_paths from serena_config.yml are also applied additively.
+ignored_paths: []
+
+# whether the project is in read-only mode
+# If set to true, all editing tools will be disabled and attempts to use them will result in an error
+# Added on 2025-04-18
+read_only: false
+
+# list of tool names to exclude.
+# This extends the existing exclusions (e.g. from the global configuration)
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+excluded_tools: []
+
+# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
+# This extends the existing inclusions (e.g. from the global configuration).
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+included_optional_tools: []
+
+# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
+# This cannot be combined with non-empty excluded_tools or included_optional_tools.
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
+fixed_tools: []
+
+# list of mode names that are to be activated by default, overriding the setting in the global configuration.
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
+# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
+# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
+# for this project.
+# This setting can, in turn, be overridden by CLI parameters (--mode).
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+default_modes:
+
+# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
+# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
+# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
+added_modes:
+
+# initial prompt for the project. It will always be given to the LLM upon activating the project
+# (contrary to the memories, which are loaded on demand).
+initial_prompt: ""
+
+# time budget (seconds) per tool call for the retrieval of additional symbol information
+# such as docstrings or parameter information.
+# This overrides the corresponding setting in the global configuration; see the documentation there.
+# If null or missing, use the setting from the global configuration.
+symbol_info_budget:
+
+# list of regex patterns which, when matched, mark a memory entry as read‑only.
+# Extends the list from the global configuration, merging the two lists.
+read_only_memory_patterns: []
+
+# list of regex patterns for memories to completely ignore.
+# Matching memories will not appear in list_memories or activate_project output
+# and cannot be accessed via read_memory or write_memory.
+# To access ignored memory files, use the read_file tool on the raw file path.
+# Extends the list from the global configuration, merging the two lists.
+# Example: ["_archive/.*", "_episodes/.*"]
+ignored_memory_patterns: []
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
 ---

 Je continue à poser mes questions et à guider la discussion.
+
+---
+
+## MCP Tools
+
+**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
+
+### Hiérarchie des outils pour ce projet Rust
+
+**Navigation et édition de code → serena en priorité**
+- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
+- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
+- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
+- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
+- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
+- Ne pas utiliser `cclsp` quand serena couvre le besoin
+
+**Analyse architecturale → jcodemunch**
+- Hotspots, couplage, dead code, dépendances entre modules
+- Utiliser avant de refactorer une zone critique
+
+**Raisonnement complexe → sequential-thinking**
+- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
+
+**Documentation de crates → context7**
+- Toujours consulter avant d'utiliser une API de bibliothèque externe
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
 		mkdocs mkdocs-material \
 		mkdocs-mermaid2-plugin \
 		mkdocs-bibtex
+	$(PIP) install --quiet --upgrade InSilicoSeq

 # ── obikmer binary ───────────────────────────────────────────────────────────

@@ -62,3 +63,32 @@ clean-doc:
 .PHONY: clean
 clean: clean-doc
 	rm -rf $(VENV)
+
+# ── release ───────────────────────────────────────────────────────────────────
+
+CARGO_TOML := $(CARGO_DIR)/obikmer/Cargo.toml
+
+.PHONY: bump-version
+bump-version:
+	@current=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
+	if [ -n "$(RELEASE)" ]; then \
+		new_version="$(RELEASE)"; \
+	else \
+		major=$$(echo $$current | cut -d. -f1); \
+		minor=$$(echo $$current | cut -d. -f2); \
+		patch=$$(echo $$current | cut -d. -f3); \
+		new_patch=$$((patch + 1)); \
+		new_version="$$major.$$minor.$$new_patch"; \
+	fi; \
+	echo "Version: $$current -> $$new_version"; \
+	sed -i.bak "s/^version = \"$$current\"/version = \"$$new_version\"/" $(CARGO_TOML) && \
+	rm $(CARGO_TOML).bak
+
+.PHONY: release
+release: bump-version
+	@jj auto-describe
+	@jj git push --change @
+	@new_version=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \
+	git_hash=$$(jj log -r @ --no-graph -T 'commit_id'); \
+	git tag "v$$new_version" "$$git_hash" && \
+	git push origin "v$$new_version"
@@ -51,7 +51,13 @@ Non-ACGT characters act as hard breaks between k-mer segments in all formats.
               Runs scatter → dereplicate → count → layered MPHF.
               Resumes automatically if interrupted.
    merge      Merge multiple independently built indexes into one.
-    rebuild    Filter and compact an existing index: apply count thresholds,
+               Schedules partitions largest-first under a memory budget semaphore
+               to avoid OOM on machines with many cores. The worst partition runs
+               alone first to calibrate the expansion estimator; subsequent
+               partitions run in parallel within the budget.
+               --budget-fraction F  fraction of available RAM to use as budget
+                                    (default 0.5; reduce if OOM persists).
+    filter     Filter and compact an existing index: apply count thresholds,
               drop layers, rewrite as a single-layer index.
    reindex    Convert evidence in-place across all layers:
               exact (evidence.bin) ↔ approximate (fingerprint.bin).
@@ -74,7 +80,14 @@ Non-ACGT characters act as hard breaks between k-mer segments in all formats.
               Diagnostic / pipeline use.
    unitig     Dump the unitig sequences stored in a built index. Debug use.
    utils      Miscellaneous utilities.
-               --new-label NEW=OLD  renames a genome label in-place.
+               --new-label NEW=OLD      rename a genome label in-place.
+               --bits-per-kmer          print MPHF / evidence / matrix size breakdown.
+               --stats                  per-genome k-mer counts as CSV.
+               --partition-stats        partition size distribution across one or more
+                                        indexes (markdown report to stdout). Useful to
+                                        diagnose minimizer imbalance before a large merge.
+               --csv FILE               write per-(partition, source) raw data to FILE
+                                        (used with --partition-stats).

 ## Quick start

@@ -0,0 +1,144 @@
+# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
+BINARY  := ../src/target/release/obikmer
+VENV_PY := ../.venv/bin/python3
+
+GENOMES := $(wildcard genomes/*.fna.gz)
+
+# SPECIMENS, SPECIES, and the full dependency graph are generated by
+# make_deps.py from the genome FASTA headers — like .d files in C.
+# Make rebuilds deps.mk whenever genomes/ changes and restarts.
+-include deps.mk
+
+REF_NPZS              := $(SPECIMENS:%=reference_index/%.npz)
+PRESENCE_DONE         := $(SPECIMENS:%=specimen_index_presence/%/index.done)
+PRESENCE_STATS        := $(SPECIMENS:%=stats/indexing_presence/%.stats)
+COUNT_DONE            := $(SPECIMENS:%=specimen_index_count/%/index.done)
+COUNT_STATS           := $(SPECIMENS:%=stats/indexing_count/%.stats)
+VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
+VERIFY_COUNT_STATS    := $(SPECIMENS:%=stats/verify_count/%.stats)
+SPECIFIC_PRESENCE_DONE  := $(SPECIES:%=specific_index_presence/%/index.done)
+SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
+SPECIFIC_COUNT_DONE     := $(SPECIES:%=specific_index_count/%/index.done)
+SPECIFIC_COUNT_STATS    := $(SPECIES:%=stats/specific_kmer_count/%.stats)
+SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
+
+.NOTPARALLEL:
+
+.PHONY: all simulate reference \
+        index_presence index_count \
+        aggregate_index_presence aggregate_index_count \
+        merge_presence merge_count \
+        verify_presence verify_count \
+        aggregate_verify_presence aggregate_verify_count \
+        verify_merge_presence verify_merge_count \
+        filter_presence filter_count \
+        aggregate_filter_presence aggregate_filter_count
+
+verify_merge_presence: stats/verify_merge_presence/current.csv
+verify_merge_count:    stats/verify_merge_count/current.csv
+
+all: aggregate_verify_presence aggregate_verify_count \
+     verify_merge_presence verify_merge_count \
+     aggregate_filter_presence aggregate_filter_count
+
+# ── dependency file ───────────────────────────────────────────────────────────
+
+deps.mk: $(GENOMES)
+	$(VENV_PY) make_deps.py $^ > $@
+
+# ── simulation ────────────────────────────────────────────────────────────────
+# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
+
+$(SIMULATED_READS):
+	bash simulate_one.sh $< $(dir $@)
+
+simulate: $(SIMULATED_READS)
+
+# ── reference kmer sets ───────────────────────────────────────────────────────
+# Prerequisites (reads → npz) are in deps.mk.
+
+reference_index/%.npz:
+	bash build_reference.sh $*
+
+reference: $(REF_NPZS)
+
+# ── per-specimen indexing ─────────────────────────────────────────────────────
+# Prerequisites (reads → index.done + .stats) are in deps.mk.
+
+specimen_index_presence/%/index.done \
+stats/indexing_presence/%.stats &: $(BINARY)
+	bash index_one_presence.sh $*
+
+specimen_index_count/%/index.done \
+stats/indexing_count/%.stats &: $(BINARY)
+	bash index_one_count.sh $*
+
+index_presence: $(PRESENCE_DONE)
+index_count:    $(COUNT_DONE)
+
+# ── indexing stats aggregation ────────────────────────────────────────────────
+
+aggregate_index_presence: $(PRESENCE_STATS)
+	bash aggregate_stats.sh indexing_presence
+
+aggregate_index_count: $(COUNT_STATS)
+	bash aggregate_stats.sh indexing_count
+
+# ── global merge ──────────────────────────────────────────────────────────────
+
+global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
+	bash merge_presence.sh
+
+global_index_count/index.done: $(COUNT_DONE) $(BINARY)
+	bash merge_count.sh
+
+merge_presence: global_index_presence/index.done
+merge_count:    global_index_count/index.done
+
+# ── per-specimen verification ─────────────────────────────────────────────────
+# Prerequisites (index.done + npz → .stats) are in deps.mk.
+
+stats/verify_presence/%.stats:
+	bash verify_one_presence.sh $*
+
+stats/verify_count/%.stats:
+	bash verify_one_count.sh $*
+
+verify_presence: $(VERIFY_PRESENCE_STATS)
+verify_count:    $(VERIFY_COUNT_STATS)
+
+# ── verification stats aggregation ───────────────────────────────────────────
+
+aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
+	bash aggregate_stats.sh verify_presence
+
+aggregate_verify_count: $(VERIFY_COUNT_STATS)
+	bash aggregate_stats.sh verify_count
+
+# ── species-specific indexes ──────────────────────────────────────────────────
+# Prerequisites (global index → specific index) are in deps.mk.
+
+specific_index_presence/%/index.done \
+stats/specific_kmer_presence/%.stats &: $(BINARY)
+	bash filter_one_presence.sh $*
+
+specific_index_count/%/index.done \
+stats/specific_kmer_count/%.stats &: $(BINARY)
+	bash filter_one_count.sh $*
+
+filter_presence: $(SPECIFIC_PRESENCE_DONE)
+filter_count:    $(SPECIFIC_COUNT_DONE)
+
+aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
+	bash aggregate_stats.sh specific_kmer_presence
+
+aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
+	bash aggregate_stats.sh specific_kmer_count
+
+# ── merged index verification ─────────────────────────────────────────────────
+
+stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
+	bash verify_merge_presence.sh
+
+stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
+	bash verify_merge_count.sh
@@ -0,0 +1,132 @@
+# Benchmark pipeline
+
+Requires **GNU Make ≥ 4.3** (grouped targets `&:`).  On macOS use `gmake`.
+
+```
+gmake all          # full pipeline
+gmake simulate     # simulation only
+gmake reference    # reference kmer sets only
+```
+
+## Pipeline overview
+
+```mermaid
+flowchart TD
+    GENOMES["genomes/*.fna.gz"]
+    BIN["obikmer binary"]
+
+    GENOMES --> simulate
+    simulate --> simdata[("simulated_data/")]
+
+    simdata --> reference
+    reference --> refnpz[("reference_index/*.npz")]
+
+    subgraph presence ["Presence track"]
+        simdata  --> index_presence
+        BIN      --> index_presence
+        index_presence --> pres_done[("specimen_index_presence/")]
+        index_presence --> pres_istats[("stats/indexing_presence/")]
+        pres_istats --> aggregate_index_presence
+
+        pres_done --> merge_presence
+        BIN       --> merge_presence
+        merge_presence --> gpres[("global_index_presence/")]
+
+        refnpz    --> verify_presence
+        pres_done --> verify_presence
+        verify_presence --> vpres_stats[("stats/verify_presence/")]
+        vpres_stats --> aggregate_verify_presence
+
+        gpres --> filter_presence
+        BIN   --> filter_presence
+        filter_presence --> spec_pres[("specific_index_presence/")]
+        filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
+        spec_pres_stats --> aggregate_filter_presence
+
+        refnpz --> verify_merge_presence
+        gpres  --> verify_merge_presence
+        verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
+    end
+
+    subgraph count ["Count track"]
+        simdata --> index_count
+        BIN     --> index_count
+        index_count --> count_done[("specimen_index_count/")]
+        index_count --> count_istats[("stats/indexing_count/")]
+        count_istats --> aggregate_index_count
+
+        count_done --> merge_count
+        BIN        --> merge_count
+        merge_count --> gcount[("global_index_count/")]
+
+        refnpz     --> verify_count
+        count_done --> verify_count
+        verify_count --> vcount_stats[("stats/verify_count/")]
+        vcount_stats --> aggregate_verify_count
+
+        gcount --> filter_count
+        BIN    --> filter_count
+        filter_count --> spec_count[("specific_index_count/")]
+        filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
+        spec_count_stats --> aggregate_filter_count
+
+        refnpz --> verify_merge_count
+        gcount --> verify_merge_count
+        verify_merge_count --> vmc[("stats/verify_merge_count/")]
+    end
+
+    aggregate_verify_presence  --> all
+    aggregate_verify_count     --> all
+    vmp                        --> all
+    vmc                        --> all
+    all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
+    all -. "$(MAKE) re-eval" .-> aggregate_filter_count
+```
+
+## Steps
+
+| Target | Script | Description |
+|---|---|---|
+| `simulate` | `simulate_one.sh` | Simulate sequencing reads from the reference genomes |
+| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
+| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
+| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
+| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
+| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
+| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
+| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
+| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
+| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
+| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
+| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
+| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
+| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
+| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
+| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
+| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
+| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
+
+## Directory layout
+
+```
+benchmark/
+├── genomes/                        # input reference genomes (.fna.gz)
+├── simulated_data/                 # generated by simulate
+│   └── <species>/<specimen>/
+├── reference_index/                # reference kmer sets (.npz)
+├── specimen_index_presence/        # per-specimen presence indexes
+├── specimen_index_count/           # per-specimen count indexes
+├── global_index_presence/          # merged global presence index
+├── global_index_count/             # merged global count index
+├── specific_index_presence/        # species-specific presence indexes
+├── specific_index_count/           # species-specific count indexes
+└── stats/                          # all benchmark statistics
+    ├── indexing_presence/
+    ├── indexing_count/
+    ├── verify_presence/
+    ├── verify_count/
+    ├── specific_kmer_presence/
+    ├── specific_kmer_count/
+    ├── verify_merge_presence/
+    └── verify_merge_count/
+```
@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+# Usage: aggregate_stats.sh TYPE
+# TYPE = indexing_presence | indexing_count | verify_presence | verify_count
+#      | specific_kmer_presence | specific_kmer_count
+#
+# Reads all stats/TYPE/*.stats files (one CSV data row each, no header).
+# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
+# the most recent run CSV (idempotent when nothing changed).
+#
+# Exits with status 1 and a message on stderr when TYPE is missing or unknown,
+# or when the stats/TYPE directory does not exist.
+set -euo pipefail
+
+# Report a missing argument explicitly rather than through the bare
+# "unbound variable" error that `set -u` would produce on "$1".
+if [[ $# -lt 1 ]]; then
+    echo "Usage: $(basename "$0") TYPE" >&2
+    exit 1
+fi
+
+TYPE="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
+
+# Each stats type has its own column layout; "run" is prepended to every row below.
+case "${TYPE}" in
+    indexing_presence|indexing_count)
+        HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
+        ;;
+    verify_presence)
+        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
+        ;;
+    verify_count)
+        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
+        ;;
+    specific_kmer_presence|specific_kmer_count)
+        HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
+        ;;
+    *)
+        echo "ERROR: unknown stats type '${TYPE}'" >&2
+        exit 1
+        ;;
+esac
+
+# Without this check a missing directory makes the first `find` pipeline below
+# fail under `pipefail`/`errexit` with its stderr discarded, so the script
+# would abort without printing anything.
+if [[ ! -d "${STATS_DIR}" ]]; then
+    echo "ERROR: stats directory '${STATS_DIR}' does not exist" >&2
+    exit 1
+fi
+
+# Find most recent existing run CSV (empty string if none).
+latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1)
+
+# Check if any .stats file is newer than the latest run CSV.
+if [[ -n "${latest_csv}" ]] && \
+   [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
+    echo "[${TYPE}] stats up to date (${latest_csv})"
+    exit 0
+fi
+
+# The next run number is the count of existing run CSVs, zero-padded to 3 digits.
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+echo "${HEADER}" >"${CSV}"
+
+# Sort .stats files by name for reproducible row order.
+while IFS= read -r stats_file; do
+    sed "s/^/${run_n},/" "${stats_file}"
+done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"
+
+echo "[${TYPE}] run ${run_n} → ${CSV}"
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+"""Build a reference kmer index from paired-end FASTQ reads.
+
+Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
+counts their abundances, and saves a sorted numpy pair (kmers, counts).
+
+Output .npz arrays
+  kmers  : uint64, sorted ascending — canonical kmer integers
+  counts : uint32, same order      — raw read abundances
+"""
+import argparse
+import gzip
+import sys
+from collections import defaultdict
+
+import numpy as np
+
+
+# ── encoding ────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+# Lookup table: revcomp of one byte (4 bases, 8 bits).
+# Precomputed once at import time.
+_REVCOMP8 = [0] * 256
+for _i in range(256):
+    _rc, _x = 0, _i
+    for _ in range(4):
+        _rc = (_rc << 2) | (3 - (_x & 3))
+        _x >>= 2
+    _REVCOMP8[_i] = _rc
+del _i, _rc, _x
+
+
+def revcomp_int(kmer: int, k: int) -> int:
+    """Reverse-complement of a kmer encoded as an integer (2 bits/base).
+
+    Uses byte-level lookup (4 bases at a time) for speed.
+    """
+    rc = 0
+    bits_left = 2 * k
+    while bits_left > 0:
+        chunk = min(8, bits_left)
+        rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk)
+        rc = (rc << chunk) | rc_byte
+        kmer >>= chunk
+        bits_left -= chunk
+    return rc
+
+
+# ── FASTQ parsing ────────────────────────────────────────────────────────────
+
+def iter_sequences(path: str):
+    """Yield raw sequences from a (gzipped) FASTQ file."""
+    opener = gzip.open if path.endswith('.gz') else open
+    with opener(path, 'rt') as fh:
+        while True:
+            if not fh.readline():   # '@' header
+                break
+            seq = fh.readline().rstrip('\n')
+            fh.readline()           # '+'
+            fh.readline()           # quality
+            yield seq
+
+
+# ── kmer counting ────────────────────────────────────────────────────────────
+
+def count_kmers(paths: list[str], k: int) -> dict[int, int]:
+    mask = (1 << (2 * k)) - 1
+    counts: dict[int, int] = defaultdict(int)
+    n_reads = 0
+
+    for path in paths:
+        for seq in iter_sequences(path):
+            n_reads += 1
+            kmer = 0
+            run = 0          # consecutive valid bases
+
+            for c in seq:
+                b = _ENCODE.get(c)
+                if b is None:    # N or unexpected character → reset
+                    kmer = 0
+                    run = 0
+                    continue
+                kmer = ((kmer << 2) | b) & mask
+                run += 1
+                if run >= k:
+                    rc = revcomp_int(kmer, k)
+                    counts[kmer if kmer <= rc else rc] += 1
+
+            if n_reads % 100_000 == 0:
+                print(f'  {n_reads:,} reads processed, '
+                      f'{len(counts):,} distinct kmers so far',
+                      file=sys.stderr)
+
+    print(f'  {n_reads:,} reads total, {len(counts):,} distinct kmers',
+          file=sys.stderr)
+    return counts
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reads', nargs='+', metavar='FASTQ',
+                    help='Input reads (FASTQ, gzip OK)')
+    ap.add_argument('-k', '--kmer-size', type=int, default=31,
+                    metavar='K')
+    ap.add_argument('--min-abundance', type=int, default=1,
+                    metavar='N', help='Drop kmers with count < N (default 1)')
+    ap.add_argument('-o', '--output', required=True,
+                    metavar='FILE', help='Output .npz path')
+    args = ap.parse_args()
+
+    print(f'k={args.kmer_size}  files={len(args.reads)}', file=sys.stderr)
+    counts = count_kmers(args.reads, args.kmer_size)
+
+    if args.min_abundance > 1:
+        before = len(counts)
+        counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
+        print(f'  min-abundance={args.min_abundance}: '
+              f'{before - len(counts):,} kmers dropped, '
+              f'{len(counts):,} retained',
+              file=sys.stderr)
+
+    print(f'Sorting and saving → {args.output}', file=sys.stderr)
+    kmers_arr  = np.fromiter(sorted(counts), dtype=np.uint64, count=len(counts))
+    counts_arr = np.array([counts[int(k)] for k in kmers_arr], dtype=np.uint32)
+
+    np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
+    print(f'Done  {len(kmers_arr):,} kmers  →  {args.output}', file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+BUILD_PY="${SCRIPT_DIR}/build_reference.py"
+
+KMER_SIZE="${KMER_SIZE:-31}"
+MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
+
+mkdir -p "${REF_DIR}"
+
+for species_dir in "${SIMDATA_DIR}"/*/; do
+    [[ -d "${species_dir}" ]] || continue
+    species=$(basename "${species_dir}")
+
+    for strain_dir in "${species_dir}"*/; do
+        [[ -d "${strain_dir}" ]] || continue
+        strain=$(basename "${strain_dir}")
+
+        r1="${strain_dir}/reads_R1.fastq.gz"
+        r2="${strain_dir}/reads_R2.fastq.gz"
+        if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+            echo "SKIP ${species}--${strain}: reads not found" >&2
+            continue
+        fi
+
+        out="${REF_DIR}/${species}--${strain}.npz"
+        echo "[${species}--${strain}] → ${out}"
+
+        "${PYTHON}" "${BUILD_PY}" \
+            --kmer-size      "${KMER_SIZE}" \
+            --min-abundance  "${MIN_ABUNDANCE}" \
+            --output         "${out}" \
+            "${r1}" "${r2}"
+    done
+done
@@ -0,0 +1,199 @@
+# Specimen identifiers in SPECIES--STRAIN form; each one has a matching group
+# of read / reference-index / specimen-index / verify rules below.
+SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
+# Distinct species names found in SPECIMENS; each one has a pair of
+# specific_index_presence / specific_index_count rules further down.
+SPECIES   := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
+
+# Escherichia_coli--K-12_MG1655
+simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
+reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
+stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
+
+# Escherichia_coli--EDL933
+simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
+reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
+stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
+
+# Salmonella_enterica--LT2
+simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
+reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
+stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
+
+# Escherichia_coli--CFT073
+simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
+reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
+stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
+
+# Bacillus_subtilis--168
+simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
+reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
+stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
+stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
+
+# Salmonella_enterica--P125109
+simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
+reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
+stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
+
+# Shouchella_clausii--KSM-K16
+simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
+reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
+stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
+stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
+
+# Escherichia_coli--K-12_W3110
+simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
+reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
+stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
+stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
+
+# Klebsiella_pneumoniae--MGH_78578
+simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
+stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
+
+# Opitutus_terrae--PB90-1
+simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
+reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
+stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
+stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
+
+# Saccharolobus_islandicus--M.16.4
+simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
+reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
+stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
+stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
+
+# Acidobacterium_capsulatum--ATCC_51196
+simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
+reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
+stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
+stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
+
+# Salmonella_enterica--AKU_12601
+simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
+reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
+stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
+
+# Proteus_mirabilis--HI4320
+simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
+reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
+stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
+stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
+
+# Salmonella_enterica--CT18
+simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
+reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
+stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
+stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
+
+# Klebsiella_pneumoniae--HS11286
+simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
+stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
+
+# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
+simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
+reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
+stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
+stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
+
+# Klebsiella_pneumoniae--ATCC_13883
+simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
+reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
+stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
+stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
+
+# Yersinia_ruckeri--YRB
+simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
+reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
+stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
+stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
+
+# Candidozyma_auris--GCF_003013715.1_ASM301371v2
+simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
+reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
+stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
+stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
+
+# Escherichia_coli
+specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
+specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
+# Salmonella_enterica
+specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
+specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
+# Bacillus_subtilis
+specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
+specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
+# Shouchella_clausii
+specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
+specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
+# Klebsiella_pneumoniae
+specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
+specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
+# Opitutus_terrae
+specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
+specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
+# Saccharolobus_islandicus
+specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
+specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
+# Acidobacterium_capsulatum
+specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
+specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
+# Proteus_mirabilis
+specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
+specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
+# Wolbachia_endosymbiont
+specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
+specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
+# Yersinia_ruckeri
+specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
+specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
+# Candidozyma_auris
+specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
+specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+assemblies=(
+    GCF_000005845.2
+    GCF_000010245.2
+    GCF_000007445.1
+    GCF_000006665.1
+
+    GCF_000006945.2
+    GCF_000195995.1
+    GCF_000009505.1
+    GCF_000026565.1
+
+    GCF_000016305.1
+    GCF_000019965.1
+    GCF_000240185.1
+    GCF_000742135.1
+
+    GCF_000069965.1
+    GCF_000022565.1
+    GCF_000306885.1
+    GCF_003013715.1
+
+    GCF_000009045.1
+    GCF_000009825.1
+    GCF_000022445.1
+    GCF_000834255.1
+)
+
+mkdir -p genomes
+
+for acc in "${assemblies[@]}"; do
+    echo "Downloading ${acc}"
+
+    datasets download genome accession "${acc}" \
+        --include genome \
+        --filename "${acc}.zip"
+
+    unzip -q "${acc}.zip" -d "${acc}"
+    find "${acc}" -name "*.fna" |
+        while read file; do
+            obiconvert -Z ${file} >genomes/$(basename ${file}).gz
+        done
+
+    rm -rf "${acc}" "${acc}.zip"
+done
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Usage: filter_one_count.sh SPECIES
+# Filters global_index_count to keep only kmers specific to SPECIES,
+# then selects the SPECIES column in-place.
+# Outputs:
+#   specific_index_count/SPECIES/index.done  (written by obikmer select)
+#   stats/specific_kmer_count/SPECIES.stats  (one CSV data row, no header)
+set -euo pipefail
+
+# Fail with a usage message rather than bash's bare "unbound variable" error.
+SPECIES="${1:?Usage: filter_one_count.sh SPECIES}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+SOURCE="${SCRIPT_DIR}/global_index_count"
+OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
+STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
+STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
+# The stats row is written here first and renamed only on success, so a
+# failed run never leaves an empty or partial .stats file behind.
+STATS_TMP="${STATS_FILE}.tmp"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIES}] filter (count) → ${OUTPUT}"
+
+LOG_FILTER=$(mktemp)
+LOG_SELECT=$(mktemp)
+trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}" "${STATS_TMP}"' EXIT
+
+# run_logged LOGFILE CMD [ARGS...]
+# Runs CMD with its stderr captured in LOGFILE, then replays the log on our
+# own stderr.  The replay happens even when CMD fails, so `set -e` cannot
+# discard the diagnostics; CMD's exit status is returned to the caller.
+run_logged() {
+    local log="$1" rc=0
+    shift
+    "$@" 2>"${log}" || rc=$?
+    cat "${log}" >&2
+    return "${rc}"
+}
+
+run_logged "${LOG_FILTER}" "${BINARY}" filter \
+    --output "${OUTPUT}" \
+    --force \
+    --ingroup "species=${SPECIES}" \
+    --outgroup all \
+    --min-frac 0.5 \
+    --max-frac 1.0 \
+    --max-outgroup-count 0 \
+    "${SOURCE}"
+
+run_logged "${LOG_SELECT}" "${BINARY}" select \
+    --in-place \
+    --group "${SPECIES}:species=${SPECIES}" \
+    --group-op "${SPECIES}:any" \
+    --select "${SPECIES}" \
+    "${OUTPUT}"
+
+python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_TMP}"
+import sys, re
+
+species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    """Remove ANSI CSI escape sequences from `s`."""
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    """Convert a duration such as '250ms' or '1.5s' to seconds; any other suffix gives 0.0."""
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    """Convert a size such as '1.2 GB' to bytes (KB = 1024 B); unparseable text gives 0."""
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    """True for a non-empty line that contains no ASCII letter or digit."""
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+def parse_reporter(logfile):
+    """Return {stage: (wall_seconds, rss_bytes)} read from the table in `logfile`.
+
+    The table starts at the first line containing the words "stage" and
+    "wall".  Stage rows sit between the next two separator lines, and the
+    line right after the second separator is stored under the key 'TOTAL'.
+    Cells are split on runs of two or more spaces: cell 1 is read as the
+    wall time and cell 3 as the RSS (cell 2 is not used here).
+    """
+    stats = {}
+    state = 'scan'
+    with open(logfile, errors='replace') as fh:
+        for raw in fh:
+            line = strip_ansi(raw.rstrip('\n'))
+            s    = line.strip()
+            if state == 'scan':
+                if re.search(r'\bstage\b.*\bwall\b', line):
+                    state = 'in_header'
+            elif state == 'in_header':
+                if is_sep(s): state = 'rows'
+            elif state == 'rows':
+                if is_sep(s): state = 'total'
+                elif s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 4:
+                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+            elif state == 'total':
+                if s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 3:
+                        stats['TOTAL'] = (parse_wall(parts[1]),
+                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
+                # Only the first line after the closing separator is examined.
+                break
+    return stats
+
+f = parse_reporter(log_filter)
+s = parse_reporter(log_select)
+
+# CSV row: species, then a (wall, rss) pair for each entry below.  A stage
+# that is absent from its log contributes two empty fields.
+row = [species]
+for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
+    key = 'TOTAL' if stage.endswith('_total') else stage
+    w, r = d.get(key, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+print(','.join(row))
+PYEOF
+
+mv "${STATS_TMP}" "${STATS_FILE}"
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# Usage: filter_one_presence.sh SPECIES
+# Filters global_index_presence to keep only kmers specific to SPECIES,
+# then selects the SPECIES column in-place.
+# Outputs:
+#   specific_index_presence/SPECIES/index.done  (written by obikmer select)
+#   stats/specific_kmer_presence/SPECIES.stats  (one CSV data row, no header)
+set -euo pipefail
+
+# Fail with a usage message rather than bash's bare "unbound variable" error.
+SPECIES="${1:?Usage: filter_one_presence.sh SPECIES}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+SOURCE="${SCRIPT_DIR}/global_index_presence"
+OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
+STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
+STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
+# The stats row is written here first and renamed only on success, so a
+# failed run never leaves an empty or partial .stats file behind.
+STATS_TMP="${STATS_FILE}.tmp"
+
+mkdir -p "${STATS_DIR}"
+
+echo "[${SPECIES}] filter (presence) → ${OUTPUT}"
+
+LOG_FILTER=$(mktemp)
+LOG_SELECT=$(mktemp)
+trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}" "${STATS_TMP}"' EXIT
+
+# run_logged LOGFILE CMD [ARGS...]
+# Runs CMD with its stderr captured in LOGFILE, then replays the log on our
+# own stderr.  The replay happens even when CMD fails, so `set -e` cannot
+# discard the diagnostics; CMD's exit status is returned to the caller.
+run_logged() {
+    local log="$1" rc=0
+    shift
+    "$@" 2>"${log}" || rc=$?
+    cat "${log}" >&2
+    return "${rc}"
+}
+
+run_logged "${LOG_FILTER}" "${BINARY}" filter \
+    --output "${OUTPUT}" \
+    --force \
+    --ingroup "species=${SPECIES}" \
+    --outgroup all \
+    --min-frac 0.5 \
+    --max-frac 1.0 \
+    --max-outgroup-count 0 \
+    "${SOURCE}"
+
+run_logged "${LOG_SELECT}" "${BINARY}" select \
+    --in-place \
+    --group "${SPECIES}:species=${SPECIES}" \
+    --group-op "${SPECIES}:any" \
+    --select "${SPECIES}" \
+    "${OUTPUT}"
+
+python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_TMP}"
+import sys, re
+
+species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    """Remove ANSI CSI escape sequences from `s`."""
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    """Convert a duration such as '250ms' or '1.5s' to seconds; any other suffix gives 0.0."""
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    """Convert a size such as '1.2 GB' to bytes (KB = 1024 B); unparseable text gives 0."""
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    """True for a non-empty line that contains no ASCII letter or digit."""
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+def parse_reporter(logfile):
+    """Return {stage: (wall_seconds, rss_bytes)} read from the table in `logfile`.
+
+    The table starts at the first line containing the words "stage" and
+    "wall".  Stage rows sit between the next two separator lines, and the
+    line right after the second separator is stored under the key 'TOTAL'.
+    Cells are split on runs of two or more spaces: cell 1 is read as the
+    wall time and cell 3 as the RSS (cell 2 is not used here).
+    """
+    stats = {}
+    state = 'scan'
+    with open(logfile, errors='replace') as fh:
+        for raw in fh:
+            line = strip_ansi(raw.rstrip('\n'))
+            s    = line.strip()
+            if state == 'scan':
+                if re.search(r'\bstage\b.*\bwall\b', line):
+                    state = 'in_header'
+            elif state == 'in_header':
+                if is_sep(s): state = 'rows'
+            elif state == 'rows':
+                if is_sep(s): state = 'total'
+                elif s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 4:
+                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+            elif state == 'total':
+                if s:
+                    parts = re.split(r'  +', s)
+                    if len(parts) >= 3:
+                        stats['TOTAL'] = (parse_wall(parts[1]),
+                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
+                # Only the first line after the closing separator is examined.
+                break
+    return stats
+
+f = parse_reporter(log_filter)
+s = parse_reporter(log_select)
+
+# CSV row: species, then a (wall, rss) pair for each entry below.  A stage
+# that is absent from its log contributes two empty fields.
+row = [species]
+for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
+    key = 'TOTAL' if stage.endswith('_total') else stage
+    w, r = d.get(key, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+print(','.join(row))
+PYEOF
+
+mv "${STATS_TMP}" "${STATS_FILE}"
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Usage: index_one_count.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Outputs:
+#   specimen_index_count/SPECIMEN/index.done  (written by obikmer)
+#   stats/indexing_count/SPECIMEN.stats       (one CSV data row, no header)
+set -euo pipefail
+
+# Fail with a usage message rather than bash's bare "unbound variable" error.
+SPECIMEN="${1:?Usage: index_one_count.sh SPECIMEN}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+# Split at the first "--": the text before it is the species, the rest the strain.
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
+INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+# The stats row is written here first and renamed only on success, so a
+# failed run never leaves an empty or partial .stats file behind.
+STATS_TMP="${STATS_FILE}.tmp"
+
+mkdir -p "${STATS_DIR}"
+
+r1="${READS_DIR}/reads_R1.fastq.gz"
+r2="${READS_DIR}/reads_R2.fastq.gz"
+if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+    echo "ERROR: reads not found in ${READS_DIR}" >&2
+    exit 1
+fi
+
+echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}" "${STATS_TMP}"' EXIT
+
+# Capture stderr for the stats parser, but remember the exit status instead
+# of letting `set -e` abort here: the log must be replayed first, otherwise
+# the EXIT trap would delete the only copy of obikmer's error output.
+index_rc=0
+"${BINARY}" index \
+    --output "${INDEX_PATH}" \
+    --force \
+    --theta 0 \
+    --with-counts \
+    --label "${SPECIMEN}" \
+    --meta  "species=${species}" \
+    "${r1}" "${r2}" \
+    2>"${STDERR_LOG}" || index_rc=$?
+
+cat "${STDERR_LOG}" >&2
+if [[ ${index_rc} -ne 0 ]]; then
+    exit "${index_rc}"
+fi
+
+python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_TMP}"
+import sys, re
+
+species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    """Remove ANSI CSI escape sequences from `s`."""
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    """Convert a duration such as '250ms' or '1.5s' to seconds; any other suffix gives 0.0."""
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    """Convert a size such as '1.2 GB' to bytes (KB = 1024 B); unparseable text gives 0."""
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    """True for a non-empty line that contains no ASCII letter or digit."""
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+# stage label -> (wall_seconds, rss_bytes)
+stats = {}
+# Parser state: 'scan' until the table header is found, 'in_header' until the
+# first separator, 'rows' until the second separator, then 'total'.
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s): state = 'rows'
+        elif state == 'rows':
+            if is_sep(s): state = 'total'
+            elif s:
+                # Cells are separated by runs of two or more spaces; cell 1 is
+                # read as the wall time and cell 3 as the RSS.
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            # The summary line is stored under its own label; the lookup
+            # further down expects that label to be 'TOTAL'.
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            # Only the first line after the closing separator is examined.
+            break
+
+# CSV row: species, strain, a (wall, rss) pair per stage, then the total pair.
+# A stage missing from the log contributes two empty fields.
+STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
+row = [species, strain]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+
+mv "${STATS_TMP}" "${STATS_FILE}"
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Usage: index_one_presence.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Outputs:
+#   specimen_index_presence/SPECIMEN/index.done  (written by obikmer)
+#   stats/indexing_presence/SPECIMEN.stats       (one CSV data row, no header)
+set -euo pipefail
+
+# Fail with a usage message rather than bash's bare "unbound variable" error.
+SPECIMEN="${1:?Usage: index_one_presence.sh SPECIMEN}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+# Split at the first "--": the text before it is the species, the rest the strain.
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
+INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+# The stats row is written here first and renamed only on success, so a
+# failed run never leaves an empty or partial .stats file behind.
+STATS_TMP="${STATS_FILE}.tmp"
+
+mkdir -p "${STATS_DIR}"
+
+r1="${READS_DIR}/reads_R1.fastq.gz"
+r2="${READS_DIR}/reads_R2.fastq.gz"
+if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+    echo "ERROR: reads not found in ${READS_DIR}" >&2
+    exit 1
+fi
+
+echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}" "${STATS_TMP}"' EXIT
+
+# Capture stderr for the stats parser, but remember the exit status instead
+# of letting `set -e` abort here: the log must be replayed first, otherwise
+# the EXIT trap would delete the only copy of obikmer's error output.
+index_rc=0
+"${BINARY}" index \
+    --output "${INDEX_PATH}" \
+    --force \
+    --theta 0 \
+    --label "${SPECIMEN}" \
+    --meta  "species=${species}" \
+    "${r1}" "${r2}" \
+    2>"${STDERR_LOG}" || index_rc=$?
+
+cat "${STDERR_LOG}" >&2
+if [[ ${index_rc} -ne 0 ]]; then
+    exit "${index_rc}"
+fi
+
+python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_TMP}"
+import sys, re
+
+species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    """Remove ANSI CSI escape sequences from `s`."""
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    """Convert a duration such as '250ms' or '1.5s' to seconds; any other suffix gives 0.0."""
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    """Convert a size such as '1.2 GB' to bytes (KB = 1024 B); unparseable text gives 0."""
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    """True for a non-empty line that contains no ASCII letter or digit."""
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+# stage label -> (wall_seconds, rss_bytes)
+stats = {}
+# Parser state: 'scan' until the table header is found, 'in_header' until the
+# first separator, 'rows' until the second separator, then 'total'.
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s): state = 'rows'
+        elif state == 'rows':
+            if is_sep(s): state = 'total'
+            elif s:
+                # Cells are separated by runs of two or more spaces; cell 1 is
+                # read as the wall time and cell 3 as the RSS.
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            # The summary line is stored under its own label; the lookup
+            # further down expects that label to be 'TOTAL'.
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            # Only the first line after the closing separator is examined.
+            break
+
+# CSV row: species, strain, a (wall, rss) pair per stage, then the total pair.
+# A stage missing from the log contributes two empty fields.
+STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
+row = [species, strain]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+
+mv "${STATS_TMP}" "${STATS_FILE}"
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
+
+Like C .d files: only target: prerequisites lines, no recipes.
+Recipes stay in the Makefile as generic rules.
+"""
+import gzip
+import re
+import sys
+from pathlib import Path
+
+STOP_WORDS    = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
+                 'endosymbiont', 'of'}
+STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
+
+
+def is_stop(tok):
+    t = tok.lower()
+    return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)
+
+
+def sanitize(s):
+    return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')
+
+
+def collect_tokens(text):
+    parts = []
+    for tok in text.split():
+        tok = tok.rstrip(',.')
+        if is_stop(tok):
+            break
+        parts.append(sanitize(tok))
+    return '_'.join(filter(None, parts))
+
+
+def parse_organism(defn, gcf_id):
+    words   = defn.split()
+    species = sanitize(words[0] + '_' + words[1])
+
+    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
+    if m:
+        strain = sanitize(m.group(1))
+        if m.group(2):
+            strain += '_' + sanitize(m.group(2))
+        return species, strain
+
+    m = re.search(r'\bstrain\b\s+(.*)', defn)
+    if m:
+        strain = collect_tokens(m.group(1))
+        if strain:
+            return species, strain
+
+    remainder = re.sub(r'^\S+ \S+\s*', '', defn)
+    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
+    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
+    strain    = collect_tokens(remainder)
+    return species, strain if strain else gcf_id
+
+
+def first_definition(path):
+    with gzip.open(path, 'rt') as fh:
+        for line in fh:
+            if line.startswith('>'):
+                m = re.search(r'"definition":"([^"]*)"', line)
+                return m.group(1) if m else line[1:].split()[0]
+    return Path(path).stem
+
+
+def main():
+    entries = []   # (specimen, species, sim_dir, genome_path)
+    species_seen = []
+
+    for path in sorted(sys.argv[1:]):
+        gcf_id  = Path(path).name.replace('_genomic.fna.gz', '')
+        defn    = first_definition(path)
+        sp, st  = parse_organism(defn, gcf_id)
+        specimen = f'{sp}--{st}'
+        sim_dir  = f'simulated_data/{sp}/{st}'
+        entries.append((specimen, sp, sim_dir, path))
+        if sp not in species_seen:
+            species_seen.append(sp)
+
+    specimens = [e[0] for e in entries]
+    print('SPECIMENS :=', ' '.join(specimens))
+    print('SPECIES   :=', ' '.join(species_seen))
+
+    for specimen, species, sim_dir, genome in entries:
+        reads = f'{sim_dir}/reads_R1.fastq.gz'
+        p_done  = f'specimen_index_presence/{specimen}/index.done'
+        p_stats = f'stats/indexing_presence/{specimen}.stats'
+        c_done  = f'specimen_index_count/{specimen}/index.done'
+        c_stats = f'stats/indexing_count/{specimen}.stats'
+        ref     = f'reference_index/{specimen}.npz'
+        vp      = f'stats/verify_presence/{specimen}.stats'
+        vc      = f'stats/verify_count/{specimen}.stats'
+
+        print()
+        print(f'# {specimen}')
+        print(f'{reads}: {genome}')
+        print(f'{ref}: {reads}')
+        print(f'{p_done} {p_stats}: {reads}')
+        print(f'{c_done} {c_stats}: {reads}')
+        print(f'{vp}: {ref} {p_done}')
+        print(f'{vc}: {ref} {c_done}')
+
+    print()
+    for sp in species_seen:
+        sp_done  = f'specific_index_presence/{sp}/index.done'
+        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
+        sc_done  = f'specific_index_count/{sp}/index.done'
+        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
+        print(f'# {sp}')
+        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
+        print(f'{sc_done} {sc_stats}: global_index_count/index.done')
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
+OUTPUT="${SCRIPT_DIR}/global_index_count"
+STATS_DIR="${SCRIPT_DIR}/stats/merge_count"
+
+mkdir -p "${STATS_DIR}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
+
+parse_reporter() {
+    local run="$1" n_sources="$2" logfile="$3"
+    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
+import sys, re
+
+run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s):
+                state = 'rows'
+        elif state == 'rows':
+            if is_sep(s):
+                state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
+row = [run, n_sources]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+}
+
+mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
+
+if [[ ${#sources[@]} -eq 0 ]]; then
+    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
+    exit 1
+fi
+
+echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
+printf '  %s\n' "${sources[@]}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+
+"${BINARY}" merge \
+    --output  "${OUTPUT}" \
+    --force \
+    "${sources[@]}" \
+    2>"${STDERR_LOG}"
+
+cat "${STDERR_LOG}" >&2
+parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
+
+echo "Done. Run ${run_n} → ${CSV}"
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
+OUTPUT="${SCRIPT_DIR}/global_index_presence"
+STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"
+
+mkdir -p "${STATS_DIR}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
+CSV="${STATS_DIR}/run_${run_n}.csv"
+
+printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
+
+parse_reporter() {
+    local run="$1" n_sources="$2" logfile="$3"
+    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
+import sys, re
+
+run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+stats = {}
+state = 'scan'
+
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+
+        if state == 'scan':
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            if is_sep(s):
+                state = 'rows'
+        elif state == 'rows':
+            if is_sep(s):
+                state = 'total'
+            elif s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            break
+
+STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
+row = [run, n_sources]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF
+}
+
+mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
+
+if [[ ${#sources[@]} -eq 0 ]]; then
+    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
+    exit 1
+fi
+
+echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
+printf '  %s\n' "${sources[@]}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+
+"${BINARY}" merge \
+    --output          "${OUTPUT}" \
+    --force \
+    --force-presence \
+    "${sources[@]}" \
+    2>"${STDERR_LOG}"
+
+cat "${STDERR_LOG}" >&2
+parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
+
+echo "Done. Run ${run_n} → ${CSV}"
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+# Simulate all genomes. Delegates to simulate_one.sh per genome.
+# Prefer running via `gmake simulate` which handles individual dependencies.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+for genome_file in "${SCRIPT_DIR}"/genomes/*.fna.gz; do
+    out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
+        --dir-for "${genome_file}")
+    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
+done
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Usage: simulate_one.sh genome.fna.gz output_dir
+# Simulates paired-end HiSeq reads for a single genome.
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ISS="${SCRIPT_DIR}/../.venv/bin/iss"
+COVERAGE=15
+READ_LENGTH=150
+CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
+
+genome_file="$1"
+out_dir="$2"
+
+mkdir -p "${out_dir}"
+
+tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna")
+trap 'rm -f "${tmp_fasta}"' EXIT
+
+gzip -dc "${genome_file}" > "${tmp_fasta}"
+
+genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
+n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
+
+echo "[${out_dir}]  genome=${genome_size} bp  →  ${n_reads} read pairs  (${COVERAGE}x HiSeq)"
+
+"${ISS}" generate \
+    --genomes   "${tmp_fasta}" \
+    --model     HiSeq \
+    --n_reads   "${n_reads}" \
+    --cpus      "${CPUS}" \
+    --compress \
+    --output    "${out_dir}/reads"
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+"""Compare an obikmer count index against a reference kmer set (presence + counts).
+
+Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
+streams `obikmer dump` from a --with-counts index, then reports:
+  - false negatives : kmers in reference absent from the index
+  - false positives : kmers in the index absent from the reference
+  - count mismatches: kmers present in both but with differing counts
+
+Output to stdout: one CSV row
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
+  fn_pct,fp_pct,cm_pct
+"""
+import argparse
+import subprocess
+import sys
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── dump parsing ──────────────────────────────────────────────────────────────
+
+def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
+    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32)."""
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+    kmers, counts = [], []
+    header = True
+    for line in proc.stdout:
+        if header:
+            header = False
+            continue
+        parts = line.rstrip('\n').split(',')
+        kmers.append(encode_kmer(parts[0]))
+        counts.append(int(parts[1]))
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+    order = np.argsort(np.array(kmers, dtype=np.uint64), kind='stable')
+    return (np.array(kmers, dtype=np.uint64)[order],
+            np.array(counts, dtype=np.uint32)[order])
+
+
+# ── comparison ────────────────────────────────────────────────────────────────
+
+def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
+            idx_kmers: np.ndarray, idx_counts: np.ndarray,
+            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).
+
+    All arrays sorted; cm_* cover kmers present in both arrays but with
+    differing counts.
+    """
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    # Count mismatches among shared kmers.
+    # Both arrays are sorted so we can use searchsorted.
+    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
+    pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
+    shared_mask = idx_kmers[pos_in_idx] == ref_kmers
+
+    shared_ref_counts = ref_counts[shared_mask]
+    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
+    mismatch_mask     = shared_ref_counts != shared_idx_counts
+
+    cm_kmers      = ref_kmers[shared_mask][mismatch_mask]
+    cm_ref_counts = shared_ref_counts[mismatch_mask]
+    cm_idx_counts = shared_idx_counts[mismatch_mask]
+
+    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?',
+                    help='Reference .npz file')
+    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?',
+                    help='obikmer index directory (built with --with-counts)')
+    ap.add_argument('--obikmer',  default='obikmer',
+                    help='Path to obikmer binary')
+    ap.add_argument('--species',  default='')
+    ap.add_argument('--strain',   default='')
+    ap.add_argument('--header',   action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fp',  metavar='FILE',
+                    help='Save false-positive kmer strings to FILE')
+    ap.add_argument('--save-fn',  metavar='FILE',
+                    help='Save false-negative kmer strings to FILE')
+    ap.add_argument('--save-cm',  metavar='FILE',
+                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,count_mismatch,'
+              'fn_pct,fp_pct,cm_pct')
+        return
+
+    # Detect k
+    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
+    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    # Load reference
+    print(f'Loading reference: {args.reference}', file=sys.stderr)
+    npz = np.load(args.reference)
+    ref_kmers  = npz['kmers']    # sorted uint64
+    ref_counts = npz['counts']   # uint32
+
+    # Load index
+    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
+    idx_kmers, idx_counts = load_index(args.obikmer, args.index)
+
+    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
+
+    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
+        ref_kmers, ref_counts, idx_kmers, idx_counts)
+
+    n_shared  = len(ref_kmers) - len(false_neg)
+    fn_pct    = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct    = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+    cm_pct    = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
+
+    print(f'false negatives : {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
+    print(f'false positives : {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
+    print(f'count mismatches: {len(cm_kmers):,}  ({cm_pct:.4f}% of shared)',
+          file=sys.stderr)
+
+    if args.save_fn and len(false_neg):
+        with open(args.save_fn, 'w') as fh:
+            for v in false_neg:
+                fh.write(decode_kmer(int(v), k) + '\n')
+
+    if args.save_fp and len(false_pos):
+        with open(args.save_fp, 'w') as fh:
+            for v in false_pos:
+                fh.write(decode_kmer(int(v), k) + '\n')
+
+    if args.save_cm and len(cm_kmers):
+        with open(args.save_cm, 'w') as fh:
+            fh.write('kmer,ref_count,idx_count\n')
+            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
+                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')
+
+    print(f'{args.species},{args.strain},'
+          f'{len(ref_kmers)},{len(idx_kmers)},'
+          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
+          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""Verify the merged count index against all per-specimen reference sets.
+
+Streams `obikmer dump` once on the merged index, accumulates per-specimen
+kmer+count pairs from each column, then compares each against its reference .npz.
+
+Output to stdout: one CSV row per specimen (same columns as verify_count.py)
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
+  fn_pct,fp_pct,cm_pct
+"""
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── single-pass dump ──────────────────────────────────────────────────────────
+
+def stream_merged_dump(obikmer_bin: str, index_dir: str,
+                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
+    """Stream the merged dump once.
+
+    Returns:
+        specimen_names : column labels in dump order
+        per_specimen   : mapping label → (kmer_ints, counts) for entries > 0
+    """
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+
+    header_line = proc.stdout.readline().rstrip('\n')
+    cols = header_line.split(',')
+    specimen_names = cols[1:]
+    per_specimen: dict[str, tuple[list[int], list[int]]] = {
+        name: ([], []) for name in specimen_names}
+
+    for line in proc.stdout:
+        parts = line.rstrip('\n').split(',')
+        kmer_int = encode_kmer(parts[0])
+        for i, name in enumerate(specimen_names):
+            count = int(parts[i + 1])
+            if count > 0:
+                per_specimen[name][0].append(kmer_int)
+                per_specimen[name][1].append(count)
+
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+
+    return specimen_names, per_specimen
+
+
+# ── per-specimen comparison ───────────────────────────────────────────────────
+
+def compare_specimen(name: str,
+                     kmer_list: list[int],
+                     count_list: list[int],
+                     ref_dir: Path,
+                     k: int,
+                     save_fn: Path | None,
+                     save_fp: Path | None,
+                     save_cm: Path | None,
+                     ) -> str:
+    ref_path = ref_dir / f'{name}.npz'
+    if not ref_path.exists():
+        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
+        return ''
+
+    species = name.split('--')[0]
+    strain  = name[len(species) + 2:]
+
+    npz        = np.load(ref_path)
+    ref_kmers  = npz['kmers']    # sorted uint64
+    ref_counts = npz['counts']   # uint32
+
+    order      = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
+    idx_kmers  = np.array(kmer_list,  dtype=np.uint64)[order]
+    idx_counts = np.array(count_list, dtype=np.uint32)[order]
+
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    # Count mismatches among shared kmers
+    pos_in_idx     = np.searchsorted(idx_kmers, ref_kmers)
+    pos_in_idx     = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
+    shared_mask    = idx_kmers[pos_in_idx] == ref_kmers
+    mismatch_mask  = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
+    cm_kmers       = ref_kmers[shared_mask][mismatch_mask]
+    cm_ref         = ref_counts[shared_mask][mismatch_mask]
+    cm_idx         = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
+
+    n_shared = int(shared_mask.sum())
+    fn_pct   = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct   = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+    cm_pct   = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0
+
+    print(f'  {name}: ref={len(ref_kmers):,}  idx={len(idx_kmers):,}  '
+          f'fn={len(false_neg):,} ({fn_pct:.4f}%)  '
+          f'fp={len(false_pos):,} ({fp_pct:.4f}%)  '
+          f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
+          file=sys.stderr)
+
+    if save_fn and len(false_neg):
+        fn_file = save_fn / f'{name}_fn.txt'
+        fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
+
+    if save_fp and len(false_pos):
+        fp_file = save_fp / f'{name}_fp.txt'
+        fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
+
+    if save_cm and len(cm_kmers):
+        cm_file = save_cm / f'{name}_cm.csv'
+        lines = ['kmer,ref_count,idx_count']
+        for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
+            lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
+        cm_file.write_text('\n'.join(lines) + '\n')
+
+    return (f'{species},{strain},'
+            f'{len(ref_kmers)},{len(idx_kmers)},'
+            f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
+            f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
+                    help='Merged count index directory')
+    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
+                    help='Directory containing per-specimen .npz reference files')
+    ap.add_argument('--obikmer', default='obikmer')
+    ap.add_argument('--header',  action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fn', metavar='DIR',
+                    help='Directory for false-negative kmer lists')
+    ap.add_argument('--save-fp', metavar='DIR',
+                    help='Directory for false-positive kmer lists')
+    ap.add_argument('--save-cm', metavar='DIR',
+                    help='Directory for count-mismatch CSV files')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,count_mismatch,'
+              'fn_pct,fp_pct,cm_pct')
+        return
+
+    ref_dir = Path(args.ref_dir)
+    save_fn = Path(args.save_fn) if args.save_fn else None
+    save_fp = Path(args.save_fp) if args.save_fp else None
+    save_cm = Path(args.save_cm) if args.save_cm else None
+    for d in (save_fn, save_fp, save_cm):
+        if d: d.mkdir(parents=True, exist_ok=True)
+
+    out1 = subprocess.check_output(
+        [args.obikmer, 'dump', '--head', '1', args.index],
+        stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
+    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
+    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
+
+    for name in specimen_names:
+        kmers, counts = per_specimen[name]
+        row = compare_specimen(name, kmers, counts, ref_dir, k,
+                               save_fn, save_fp, save_cm)
+        if row:
+            print(row)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+INDEX="${SCRIPT_DIR}/global_index_count"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"
+
+mkdir -p "${STATS_DIR}"
+
+CURRENT="${STATS_DIR}/current.csv"
+
+"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    "${INDEX}" "${REF_DIR}" \
+    >>"${CURRENT}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')")
+ARCHIVE="${STATS_DIR}/count_${run_n}.csv"
+cp "${CURRENT}" "${ARCHIVE}"
+
+echo "Done. Results → ${ARCHIVE}"
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+"""Verify the merged presence index against all per-specimen reference sets.
+
+Streams `obikmer dump` once on the merged index, accumulates per-specimen
+kmer sets from each column, then compares each against its reference .npz.
+
+Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
+  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
+"""
+import argparse
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── single-pass dump ──────────────────────────────────────────────────────────
+
+def stream_merged_dump(obikmer_bin: str, index_dir: str,
+                       ) -> tuple[list[str], dict[str, list[int]]]:
+    """Stream the merged dump once.
+
+    Returns:
+        specimen_names : column labels in dump order (excluding 'kmer')
+        per_specimen   : mapping label → list of kmer ints where presence > 0
+    """
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+
+    header_line = proc.stdout.readline().rstrip('\n')
+    cols = header_line.split(',')
+    specimen_names = cols[1:]           # first col is 'kmer'
+    per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
+
+    for line in proc.stdout:
+        parts = line.rstrip('\n').split(',')
+        kmer_int = encode_kmer(parts[0])
+        for i, name in enumerate(specimen_names):
+            if int(parts[i + 1]) > 0:
+                per_specimen[name].append(kmer_int)
+
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+
+    return specimen_names, per_specimen
+
+
+# ── per-specimen comparison ───────────────────────────────────────────────────
+
+def compare_specimen(name: str,
+                     kmer_list: list[int],
+                     ref_dir: Path,
+                     k: int,
+                     save_fn: Path | None,
+                     save_fp: Path | None,
+                     ) -> str:
+    """Compare one specimen column against its reference .npz.
+
+    Returns a CSV row string.
+    """
+    ref_path = ref_dir / f'{name}.npz'
+    if not ref_path.exists():
+        print(f'  SKIP {name}: no reference at {ref_path}', file=sys.stderr)
+        return ''
+
+    species = name.split('--')[0]
+    strain  = name[len(species) + 2:]
+
+    ref_kmers = np.load(ref_path)['kmers']          # sorted uint64
+    idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
+
+    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
+    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
+
+    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+
+    print(f'  {name}: ref={len(ref_kmers):,}  idx={len(idx_kmers):,}  '
+          f'fn={len(false_neg):,} ({fn_pct:.4f}%)  '
+          f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
+          file=sys.stderr)
+
+    if save_fn and len(false_neg):
+        fn_file = save_fn / f'{name}_fn.txt'
+        fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
+
+    if save_fp and len(false_pos):
+        fp_file = save_fp / f'{name}_fp.txt'
+        fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
+
+    return (f'{species},{strain},'
+            f'{len(ref_kmers)},{len(idx_kmers)},'
+            f'{len(false_neg)},{len(false_pos)},'
+            f'{fn_pct:.4f},{fp_pct:.4f}')
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('index',     metavar='INDEX_DIR', nargs='?',
+                    help='Merged presence index directory')
+    ap.add_argument('ref_dir',   metavar='REF_DIR',   nargs='?',
+                    help='Directory containing per-specimen .npz reference files')
+    ap.add_argument('--obikmer', default='obikmer')
+    ap.add_argument('--header',  action='store_true',
+                    help='Print CSV header and exit')
+    ap.add_argument('--save-fn', metavar='DIR',
+                    help='Directory to save false-negative kmer lists')
+    ap.add_argument('--save-fp', metavar='DIR',
+                    help='Directory to save false-positive kmer lists')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,fn_pct,fp_pct')
+        return
+
+    ref_dir  = Path(args.ref_dir)
+    save_fn  = Path(args.save_fn) if args.save_fn else None
+    save_fp  = Path(args.save_fp) if args.save_fp else None
+    if save_fn: save_fn.mkdir(parents=True, exist_ok=True)
+    if save_fp: save_fp.mkdir(parents=True, exist_ok=True)
+
+    # Detect k
+    out1 = subprocess.check_output(
+        [args.obikmer, 'dump', '--head', '1', args.index],
+        stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    print(f'k={k}  streaming merged dump: {args.index}', file=sys.stderr)
+    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
+    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
+
+    for name in specimen_names:
+        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
+        if row:
+            print(row)
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+INDEX="${SCRIPT_DIR}/global_index_presence"
+REF_DIR="${SCRIPT_DIR}/reference_index"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"
+
+mkdir -p "${STATS_DIR}"
+
+CURRENT="${STATS_DIR}/current.csv"
+
+"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    "${INDEX}" "${REF_DIR}" \
+    >>"${CURRENT}"
+
+run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')")
+ARCHIVE="${STATS_DIR}/presence_${run_n}.csv"
+cp "${CURRENT}" "${ARCHIVE}"
+
+echo "Done. Results → ${ARCHIVE}"
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Usage: verify_one_count.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header)
+#
+# The row is written to a temporary file and renamed into place only after
+# the verifier succeeds, so a failed run never leaves a truncated .stats file
+# behind that a later Make run would treat as an up-to-date target.
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $(basename "$0") SPECIMEN" >&2
+    exit 2
+fi
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_count.py"
+
+# Split on the first "--": left part is the species, the rest is the strain.
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
+INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+TMP_FILE="${STATS_FILE}.tmp.$$"
+
+mkdir -p "${STATS_DIR}"
+
+# Clean up the temporary file on every exit path; after a successful mv the
+# file no longer exists and rm -f is a no-op.
+trap 'rm -f "${TMP_FILE}"' EXIT
+
+echo "[${SPECIMEN}] verifying count"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    --species "${species}" \
+    --strain  "${strain}" \
+    "${REF_NPZ}" "${INDEX_DIR}" \
+    >"${TMP_FILE}"
+
+mv "${TMP_FILE}" "${STATS_FILE}"
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Usage: verify_one_presence.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header)
+#
+# The row is written to a temporary file and renamed into place only after
+# the verifier succeeds, so a failed run never leaves a truncated .stats file
+# behind that a later Make run would treat as an up-to-date target.
+set -euo pipefail
+
+if [[ $# -ne 1 ]]; then
+    echo "Usage: $(basename "$0") SPECIMEN" >&2
+    exit 2
+fi
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
+VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"
+
+# Split on the first "--": left part is the species, the rest is the strain.
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
+INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+TMP_FILE="${STATS_FILE}.tmp.$$"
+
+mkdir -p "${STATS_DIR}"
+
+# Clean up the temporary file on every exit path; after a successful mv the
+# file no longer exists and rm -f is a no-op.
+trap 'rm -f "${TMP_FILE}"' EXIT
+
+echo "[${SPECIMEN}] verifying presence"
+
+"${PYTHON}" "${VERIFY_PY}" \
+    --obikmer "${BINARY}" \
+    --species "${species}" \
+    --strain  "${strain}" \
+    "${REF_NPZ}" "${INDEX_DIR}" \
+    >"${TMP_FILE}"
+
+mv "${TMP_FILE}" "${STATS_FILE}"
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""Compare an obikmer index against a reference kmer set (presence/absence).
+
+Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
+streams the output of `obikmer dump`, encodes each kmer string to uint64,
+then reports false negatives and false positives using numpy set operations.
+
+Output to stdout: one CSV row
+  species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
+"""
+import argparse
+import subprocess
+import sys
+
+import numpy as np
+
+
+# ── encoding ──────────────────────────────────────────────────────────────────
+
+_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
+           'a': 0, 'c': 1, 'g': 2, 't': 3}
+
+_DECODE = ['A', 'C', 'G', 'T']
+
+
+def encode_kmer(s: str) -> int:
+    kmer = 0
+    for c in s:
+        kmer = (kmer << 2) | _ENCODE[c]
+    return kmer
+
+
+def decode_kmer(val: int, k: int) -> str:
+    bases = []
+    for _ in range(k):
+        bases.append(_DECODE[val & 3])
+        val >>= 2
+    return ''.join(reversed(bases))
+
+
+# ── dump parsing ──────────────────────────────────────────────────────────────
+
+def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
+    """Stream `obikmer dump` and return a sorted uint64 array of kmer integers."""
+    cmd = [obikmer_bin, 'dump', index_dir]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
+                            text=True)
+    kmers = []
+    header = True
+    for line in proc.stdout:
+        if header:
+            header = False
+            continue
+        kmer_str = line.split(',', 1)[0]
+        kmers.append(encode_kmer(kmer_str))
+    proc.wait()
+    if proc.returncode != 0:
+        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
+        sys.exit(1)
+    arr = np.array(kmers, dtype=np.uint64)
+    arr.sort()
+    return arr
+
+
+# ── comparison ────────────────────────────────────────────────────────────────
+
+def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    """Return (false_negatives, false_positives) as uint64 arrays."""
+    false_neg = np.setdiff1d(ref, idx, assume_unique=True)
+    false_pos = np.setdiff1d(idx, ref, assume_unique=True)
+    return false_neg, false_pos
+
+
+# ── main ─────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    ap = argparse.ArgumentParser(description=__doc__,
+                                 formatter_class=argparse.RawDescriptionHelpFormatter)
+    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?', help='Reference .npz file')
+    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
+    ap.add_argument('--obikmer',  default='obikmer',   help='Path to obikmer binary')
+    ap.add_argument('--species',  default='',          help='Species label for CSV row')
+    ap.add_argument('--strain',   default='',          help='Strain label for CSV row')
+    ap.add_argument('--header',   action='store_true', help='Print CSV header and exit')
+    ap.add_argument('--save-fp',  metavar='FILE',
+                    help='Save false-positive kmer strings to FILE')
+    ap.add_argument('--save-fn',  metavar='FILE',
+                    help='Save false-negative kmer strings to FILE')
+    args = ap.parse_args()
+
+    if args.header:
+        print('species,strain,ref_kmers,idx_kmers,'
+              'false_neg,false_pos,fn_pct,fp_pct')
+        return
+
+    # Detect k from the index (one cheap call before the full dump).
+    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
+    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
+    k = len(out1.splitlines()[1].split(',')[0])
+
+    # Load reference
+    print(f'Loading reference: {args.reference}', file=sys.stderr)
+    npz = np.load(args.reference)
+    ref_kmers = npz['kmers']          # already sorted uint64
+
+    # Load index
+    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
+    idx_kmers = load_index_kmers(args.obikmer, args.index)
+
+    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)
+
+    false_neg, false_pos = compare(ref_kmers, idx_kmers)
+
+    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
+    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
+
+    print(f'false negatives: {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
+    print(f'false positives: {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
+
+    if args.save_fn and len(false_neg):
+        with open(args.save_fn, 'w') as fh:
+            for v in false_neg:
+                fh.write(decode_kmer(int(v), k) + '\n')
+        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)
+
+    if args.save_fp and len(false_pos):
+        with open(args.save_fp, 'w') as fh:
+            for v in false_pos:
+                fh.write(decode_kmer(int(v), k) + '\n')
+        print(f'False positives saved → {args.save_fp}', file=sys.stderr)
+
+    print(f'{args.species},{args.strain},'
+          f'{len(ref_kmers)},{len(idx_kmers)},'
+          f'{len(false_neg)},{len(false_pos)},'
+          f'{fn_pct:.4f},{fp_pct:.4f}')
+
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,179 @@
+# NUMA-aware partition runner
+
+## Problem
+
+All partition-level parallel loops in obikindex currently fall into two
+categories:
+
+**Naive Rayon** — used in `build_layers`, `pack_matrices`, `dump`, `select`,
+`stats`, `rebuild`, `reindex`:
+
+```rust
+(0..n).into_par_iter().for_each(|i| work(i));
+```
+
+Threads come from the global Rayon pool with no NUMA awareness.  On
+multi-socket machines this produces cross-socket memory traffic and degrades
+performance super-linearly (see [NUMA-aware worker pools](numa_worker_pools.md)).
+
+**Ad-hoc adaptive pool** — used in `merge`:
+
+A bespoke implementation with pre-spawned workers, channel-based dispatch, and
+activation control.  It handles NUMA correctly but is not reusable.
+
+Both cases should be replaced by a single generic mechanism.
+
+## Unified model
+
+The key insight is that **UMA is just the NUMA case with a single node**.  The
+runner always works the same way: one controller thread per node, each
+independently managing its own workers with the same adaptive logic.  The only
+difference between UMA and NUMA is the number of nodes and whether workers are
+pinned.
+
+```
+NUMA (k nodes)                    UMA (1 node)
+
+controller-0  controller-1  …     controller-0
+    │               │                  │
+workers[0]     workers[1]         workers[0]
+(pinned)       (pinned)           (global pool)
+    └───────────────┴──────────────────┘
+              shared work queue
+```
+
+On each node, the Rayon `ThreadPool` is pinned to that node's CPUs.
+`pool.install()` ensures all internal Rayon calls (inside the work function)
+use the node-local pool.  Linux first-touch then places heap allocations in
+local DRAM automatically.
+
+On UMA the global Rayon pool is used directly — no pinning, no overhead.
+
+## Adaptive mechanism
+
+Each controller follows the same logic regardless of node count:
+
+1. Pre-spawn `workers_per_node` dormant worker threads (blocked on `activate_rx`).
+2. Activate the first worker immediately.
+3. Loop on result channel with a `SPAWN_POLL` timeout:
+   - On result: call `on_done`; check whether to activate the next worker.
+   - On timeout: same check.
+   - Activation criterion: `should_spawn_worker(active, global_efficiency, prev_efficiency)`.
+4. Drop `activate_tx` when done — dormant workers exit cleanly.
+
+**Global CPU efficiency** (`CpuSample`, reads `/proc/stat` on Linux) is used by
+all controllers — no per-node measurement needed.  The signal is coarser than
+per-node efficiency but correct in practice: if any node saturates memory
+bandwidth, the global efficiency drops and all controllers stop activating
+workers.  Using a standard portable primitive avoids platform-specific CPU
+accounting and keeps the implementation clean.
+
+## Proposed API
+
+```rust
+pub struct PartitionRunner {
+    // One entry per NUMA node; one entry total on UMA.
+    nodes: Vec<NodeConfig>,
+}
+
+struct NodeConfig {
+    pool:       Option<Arc<rayon::ThreadPool>>,  // None = global Rayon pool (UMA)
+    cpu_ids:    Vec<usize>,                      // empty = no pinning (UMA)
+    max_workers: usize,
+}
+
+impl PartitionRunner {
+    /// Detect topology and build the runner.
+    /// Returns a single-node runner on UMA / macOS / hwloc failure.
+    pub fn new() -> Self;
+
+    /// Run `f(i)` for every index in `order`, collecting results.
+    ///
+    /// `on_done(i, result, elapsed)` is called under an internal mutex as
+    /// each partition completes — use it for progress bars and aggregation.
+    /// The runner serialises all calls to `on_done` via an internal
+    /// `Arc<Mutex<C>>`, so no `Sync` bound is required on the callback.
+    /// `Send` is required because the Arc clone crosses thread boundaries.
+    ///
+    /// Serialisation is free in practice: a partition takes seconds to
+    /// minutes; the callback takes microseconds.  Contention is negligible.
+    ///
+    /// Returns the first error from `f`, if any.
+    pub fn run<F, R, E, C>(
+        &self,
+        order:   &[usize],
+        f:       F,
+        on_done: C,
+    ) -> Result<(), E>
+    where
+        F: Fn(usize) -> Result<R, E> + Send + Sync,
+        R: Send,
+        E: Send,
+        C: FnMut(usize, R, Duration) + Send;   // Send required, Sync is not
+}
+```
+
+`order` is caller-supplied so each command chooses its scheduling strategy:
+largest-first for `merge`, sequential for `build_layers`, etc.
+
+## Migration examples
+
+### merge.rs (before: ~180 lines of bespoke machinery)
+
+```rust
+let runner = PartitionRunner::new();
+runner.run(
+    &order,
+    |i| dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence)
+            .map_err(OKIError::Partition),
+    |i, g_len, dur| {
+        pb.inc(1);
+        debug!("partition {i}: done in {:.1}s — {g_len} new kmers", dur.as_secs_f64());
+        part_stats.push(PartStat { id: i, unitig_bytes: partition_sizes[i], g_len });
+    },
+)?;
+```
+
+### index.rs build_layers (before: naive into_par_iter)
+
+```rust
+let order: Vec<usize> = (0..n).collect();
+let runner = PartitionRunner::new();
+runner.run(
+    &order,
+    |i| self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits)
+            .map_err(OKIError::Partition),
+    |_, n_kmers, _| {
+        total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
+        pb.inc(1);
+    },
+)?;
+```
+
+All other sites (`pack_matrices`, `dump`, `select`, etc.) follow the same
+pattern.
+
+## Placement
+
+`PartitionRunner` lives in `obikindex/src/numa.rs` alongside `NumaSetup`.
+It depends only on standard library primitives and Rayon — no new dependencies.
+
+A single `PartitionRunner` instance can be built once per command invocation
+and reused across multiple `run()` calls (e.g. `merge` runs
+`merge_partitions` then `pack_matrices`).
+
+## Open questions
+
+- **Error handling**: `run` currently returns the first error; remaining errors
+  are dropped.  A `Vec<E>` return would give complete diagnostics.
+
+- **`workers_per_node` tuning**: currently `(cpus / 8).max(3).min(8)`, calibrated
+  for merge on BeeGFS.  I/O-bound commands (`dump`, `select`) may benefit from
+  a higher value.  A per-call override could be added to the API.
+
+- **`on_done` ordering**: the runner serialises calls to `on_done` via an
+  internal `Arc<Mutex<C>>`, but invokes it as each partition completes, so
+  callbacks arrive in completion order rather than in the order given by
+  `order`.  `Send` is required (the Arc clone crosses thread boundaries);
+  `Sync` is not (only one thread holds the lock at a time).  Contention is
+  negligible because a partition takes seconds while the callback takes
+  microseconds.  The callback is therefore simple to write (plain
+  `Vec::push`, plain `FnMut`) with no measurable performance cost.
@@ -0,0 +1,97 @@
+# NUMA-aware worker pools for merge
+
+## Problem
+
+The merge command's bottleneck is `compute_degrees` in `obidebruinj`: a random pointer-chase over 20–70 M node hash maps that saturates DRAM bandwidth. When multiple partition workers run concurrently, they contend for the shared memory bus, causing super-linear slowdown (measured: 0.016 µs/node solo → 0.95 µs/node with 4–5 concurrent workers, ×60 degradation).
+
+Modern HPC nodes are multi-socket NUMA machines (observed: 2 sockets × 4 NUMA nodes × 24 cores = 192 cores). Cross-NUMA memory traffic compounds the contention:
+
+- Full 192-core run: ~15 min/partition (×10 worse than M3 Mac)
+- `taskset` restricted to 4 NUMA nodes (96 cores): ~90 s/partition
+- OAR job on 1 NUMA node (24 cores): ~80 s/partition, same throughput as 96 cores
+
+**Conclusion**: the bottleneck is memory bandwidth per NUMA node, not core count. 24 cores on one NUMA node achieve the same throughput as 96 cores across four.
+
+## Strategy
+
+Run N worker groups in parallel, one per NUMA node, each with its own Rayon thread pool whose threads are pinned to the NUMA node's CPUs. Linux's first-touch policy then places graph allocations on local DRAM automatically — no explicit NUMA allocator needed.
+
+Expected throughput: N × single-NUMA throughput. On the 8-NUMA-node HPC: 8 × ~80 s = 9–10 min total instead of >60 min with the current single-pool approach.
+
+## Rayon thread pool isolation
+
+Rayon provides `ThreadPool::install(|| { ... })`: any Rayon call (`par_iter`, `current_num_threads`, etc.) inside the closure uses *that* pool exclusively. Wrapping `merge_partition` in `pool.install()` redirects all downstream Rayon calls — including those in `debruijn.rs` and `partition.rs` — without touching those crates.
+
+```rust
+// worker thread, assigned to NUMA pool `pool`
+pool.install(|| {
+    dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence)
+})
+```
+
+`rayon::current_num_threads()` inside `merge_partition` will return the pool size (e.g. 24), not the global thread count — which is the right value for buffer sizing.
+
+## Thread pinning
+
+`ThreadPoolBuilder::spawn_handler` provides a hook executed for each thread at creation. Inside, `libc::sched_setaffinity` pins the thread to a CPU set:
+
+```rust
+let cpus: Vec<usize> = numa_node_cpus(node); // from /sys/devices/system/node/nodeN/cpulist
+rayon::ThreadPoolBuilder::new()
+    .num_threads(cpus.len())
+    .spawn_handler(move |thread| {
+        // The handler runs once per worker thread: give each thread its own copy.
+        let cpus = cpus.clone();
+        std::thread::Builder::new().spawn(move || {
+            pin_to_cpus(&cpus);   // sched_setaffinity via libc
+            thread.run()
+        })?;
+        Ok(())
+    })
+    .build()?
+```
+
+NUMA topology is read from `/sys/devices/system/node/node*/cpulist` — no `libnuma` dependency required. If the `numa` crate is linked, `numa_available()` / `numa_run_on_node()` are an alternative.
+
+## Memory locality
+
+Linux allocates pages on the NUMA node of the thread that first writes them (first-touch policy). Once Rayon threads are pinned to node N, all graph data built by those threads lands on node N's DRAM. No changes to the allocator, no explicit `numa_alloc_onnode` calls.
+
+## Adaptive spawn criterion
+
+The current criterion uses `std::thread::available_parallelism()` (returns total cores = 192) and `max_workers = n_cores / 2`. With NUMA pools:
+
+- `n_cores` per pool = cores per NUMA node (e.g. 24)
+- `max_workers` per pool = pool size / 2 (e.g. 12)
+- CPU efficiency is measured per pool, not globally
+
+Each NUMA group runs its own independent adaptive pool. Workers are distributed across NUMA groups round-robin or by workload (partition assignment can be pre-split by NUMA group index).
+
+## Required changes
+
+| File | Change |
+|------|--------|
+| `obikindex/src/merge.rs` | Detect NUMA topology; build N `ThreadPool`s with pinned threads; assign each pre-spawned worker to a pool; wrap `merge_partition` in `pool.install()` |
+| `obikindex/src/merge.rs` | Replace `available_parallelism()` with per-NUMA core count for spawn criterion |
+| `obikpartitionner/src/merge_layer.rs` | No change — `merge_partition` already works inside any Rayon context |
+| `obidebruinj/src/debruijn.rs` | No change — `par_iter` and `current_num_threads` are pool-context-aware |
+| `obikpartitionner/src/partition.rs` | No change — same reason |
+
+## Platform guard
+
+NUMA pinning is Linux-only. The fallback is the current single global pool:
+
+```rust
+#[cfg(target_os = "linux")]
+fn build_numa_pools() -> Option<Vec<rayon::ThreadPool>> { ... }
+
+#[cfg(not(target_os = "linux"))]
+fn build_numa_pools() -> Option<Vec<rayon::ThreadPool>> { None }
+```
+
+When `build_numa_pools()` returns `None` (macOS, UMA, or single-socket), `merge.rs` uses the existing code path unchanged.
+
+## Open questions
+
+- **Partition assignment**: split partitions by NUMA group up-front (static) or use a shared queue with per-group workers stealing from a common pool? Static split is simpler; stealing is better for load balance when partitions vary widely in size.
+- **Intra-NUMA adaptive criterion**: with 24 cores and ~3–5 effective workers per NUMA node, the current marginal-gain criterion needs re-tuning or can be left as-is with per-pool `n_cores = 24`.
+- **I/O**: partition data (unitig files) is on a shared filesystem. With 8 concurrent NUMA groups, I/O concurrency increases 8× — need to verify the filesystem (Lustre or local SSD) can absorb it without becoming the new bottleneck.
@@ -0,0 +1,105 @@
+# Rebuild / filter — column-first design
+
+## Problem with the current two-pass design
+
+`rebuild_partition` currently makes **two full passes** over source data:
+
+**Pass 1** — read unitigs → MPHF lookup (source) → read row (108 values) → apply filter → push kmer into `GraphDeBruijn`, **discard row**.
+
+**Pass 2** — read unitigs again → MPHF lookup again → read row again → for each passing kmer, look up slot in new MPHF → fill column builders.
+
+Both passes do random access into the source matrix: for each kmer, the MPHF returns a slot, then we read 108 values scattered across 108 column positions. This is cache-hostile even with a packed matrix (`.pbmx`), because the matrix is column-major: consecutive row reads jump across the file.
+
+## Memory budget
+
+The `keep` bitvector costs **1 bit per slot**. With 256 partitions and realistic kmer counts, each partition holds at most a few tens of millions of slots → a few MB per bitvector. Even in the absolute worst case (800 M slots), it is only about 100 MB. This is negligible.
+
+The `slot_map` option (Option B, 8–16 bytes per slot) is heavier but still bounded: at 15 M slots and 8 bytes, that is 120 MB per partition, acceptable for a single worker.
+
+## Key observation
+
+**The filter operates on column values, not on kmers.** A filter like `--max-outgroup-count 0` only needs to know, for each slot, whether any outgroup column is non-zero. It does not need to know which kmer occupies that slot.
+
+This means filtering can be done as a **sequential column scan** that produces a `keep: BitVec[n_slots]` — no MPHF lookups, no kmer knowledge, perfectly cache-friendly.
+
+## Proposed single-scan design
+
+### Step 1 — column scan → `keep` bitvector
+
+```
+for each column c in source matrix:
+    read column c sequentially (one mmap range)
+    update keep[slot] according to filter contribution of column c
+```
+
+For `GroupQuorumFilter` with ingroup/outgroup:
+- ingroup columns: count presence per slot → `ingroup_count[slot]`
+- outgroup columns: `keep[slot] &= (value[slot] == 0)` (early-exit possible)
+
+Result: `keep: BitVec` of size `n_slots`, computed with purely sequential IO.
+
+### Step 2 — unitig scan → kept kmers + new MPHF
+
+```
+for each kmer in unitig files:
+    old_slot = old_MPHF(kmer)
+    if keep[old_slot]:
+        push kmer into new GraphDeBruijn
+        record (old_slot, kmer)   ← or just old_slot in order
+```
+
+Build new MPHF from `GraphDeBruijn` via `materialize_layer`.
+
+### Step 3 — fill new matrix
+
+Two sub-options:
+
+**Option A — from recorded (old_slot, kmer) pairs:**
+
+```
+for each (old_slot, kmer) in recorded list:
+    new_slot = new_MPHF(kmer)
+    for each column c:
+        new_matrix[new_slot, c] = old_matrix[old_slot, c]
+```
+
+Memory cost: `n_kept × (8 + 8)` bytes for `(old_slot: usize, kmer: CanonicalKmer)`.
+For species-specific filters, `n_kept` is small. For unfiltered rebuild, `n_kept = n_slots`.
+
+**Option B — column-by-column copy using old→new slot mapping:**
+
+Precompute `slot_map: Vec<Option<usize>>` of size `n_slots`:
+- For each kmer in unitig file: `slot_map[old_MPHF(kmer)] = Some(new_MPHF(kmer))`
+
+Then for each source column:
+```
+read source column sequentially
+for each slot where slot_map[slot] = Some(new_slot):
+    write value to new column at new_slot
+```
+
+Memory cost: `n_slots × sizeof(Option<usize>)` for the slot map — 16 bytes per source slot on 64-bit targets, or 8 bytes if a sentinel value (e.g. `usize::MAX`) replaces the `Option`.
+IO pattern: sequential read of each source column → random write into new column builders.
+
+Option B avoids storing kmer values and works uniformly regardless of filter selectivity.
+
+## Comparison
+
+| | Current | Proposed |
+|---|---|---|
+| Disk reads | 2× unitigs + 2× random matrix | 1× columns (sequential) + 1× unitigs |
+| MPHF lookups (source) | 2× N_kmers | 1× N_kmers (step 2); none during the column scan |
+| Cache behavior | poor (random row access) | good (sequential column scan) |
+| Extra memory | none | slot_map (option B) or (old_slot, kmer) list (option A) |
+
+## Files to modify
+
+- `src/obikpartitionner/src/rebuild_layer.rs` — `rebuild_partition` and `iter_src_layers`
+- Possibly `src/obicompactvec/` — add column iterator API if not already present
+- `src/obilayeredmap/` — check if per-column sequential access is exposed on `SrcLayerData`
+
+## Open questions
+
+- Does `SrcLayerData` expose per-column sequential iteration, or only `lookup(kmer, n_genomes)` random access?
+- For option B: are new column builders writable in random-slot order (i.e. `set_val(slot, value)` without sequential constraint)?
+- For `GroupQuorumFilter` specifically: can the filter be decomposed into independent per-column contributions, or does it need the full row?
@@ -0,0 +1,279 @@
+# Kmer filtering and ingroup/outgroup predicates
+
+The `filter`, `dump`, and `unitig` commands share the same filtering system,
+implemented as a shared `FilterArgs` clap argument group embedded in each command
+via `#[command(flatten)]`. Filters select k-mers based on per-genome quorum
+counts, optionally restricted to **ingroup** and **outgroup** genome sets derived
+from genome metadata. All rules described here apply identically to all three commands.
+
+`filter` additionally accepts `--min-total-count` / `--max-total-count` filters
+that operate on the sum of counts across all genomes.
+
+## Predicate syntax
+
+Each `--ingroup` and `--outgroup` flag takes a predicate of the form:
+
+```
+key OP value1|value2|…
+```
+
+| Operator | Meaning |
+|----------|---------|
+| `*` or `all` | wildcard — every genome matches unconditionally |
+| `key=v1\|v2` | exact match — genome's `key` equals `v1` or `v2` |
+| `key!=v` | negation — genome's `key` equals none of the values |
+| `key~path` | path ancestry — genome's `key` is `path` or a descendant |
+| `key!~path` | not a descendant |
+
+Multiple values separated by `|` are always OR-ed within the predicate.
+
+### Path matching (`~` and `!~`)
+
+Metadata values can represent hierarchical concept paths such as
+`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
+
+Stored taxonomy values always start with `/` (the root of the path).
+Query patterns do **not** need to start with `/` — a leading `/` is an optional
+start anchor, not a requirement.
+
+| Pattern form | Semantics |
+|---|---|
+| `A/B` | contiguous sub-path A then B, anywhere in the value |
+| `/A/B` | value starts with A then B |
+| `A/B$` | value ends with A then B |
+| `/A/B$` | value is exactly A then B |
+| `A@x/B` | A with class `x` followed by B with any class |
+
+- `taxon~/Betulaceae/Betula` matches any path that starts with `Betulaceae` then `Betula`.
+- `taxon~Betula` matches any path containing `Betula` as a segment, anywhere.
+
+### Missing metadata key → NA
+
+If a genome does not carry the queried metadata key, the predicate returns **NA**.
+NA propagates through the group evaluation logic (see below), and genomes that
+cannot be classified are **ignored** in all quorum counts.
+
+## Group semantics
+
+### Multiple predicates
+
+| Flag | Combination rule |
+|------|-----------------|
+| `--ingroup` (repeated) | **AND** — genome must satisfy all predicates |
+| `--outgroup` (repeated) | **OR** — genome satisfies any predicate |
+
+### Three-value logic
+
+Each predicate returns `true`, `false`, or `NA` (absent key).
+
+- AND: `false` absorbs everything; `NA` propagates unless already `false`.
+- OR: `true` absorbs everything; `NA` propagates unless already `true`.
+
+### Classification and priority
+
+For each genome:
+
+1. Evaluate `AND(ingroup predicates)` → `in_result`
+2. Evaluate `OR(outgroup predicates)` → `out_result`
+3. If `in_result = true` → **Ingroup** (ingroup wins over outgroup)
+4. Else if `out_result = true` → **Outgroup**
+5. Otherwise → **Uncategorized** (ignored in all quorum counts)
+
+### Implicit groups
+
+| `--ingroup` | `--outgroup` | Effective behaviour |
+|-------------|--------------|---------------------|
+| not set | not set | all genomes form the ingroup |
+| set | not set | only ingroup quorum flags apply |
+| not set | set | only outgroup quorum flags apply |
+| set | set | both constraints apply simultaneously |
+
+## Quorum flags
+
+| Flag | Applies to | Meaning |
+|------|-----------|---------|
+| `--min-count N` | ingroup | k-mer present in at least N ingroup genomes |
+| `--max-count N` | ingroup | k-mer present in at most N ingroup genomes |
+| `--min-frac F` | ingroup | k-mer present in at least fraction F of ingroup genomes |
+| `--max-frac F` | ingroup | k-mer present in at most fraction F of ingroup genomes |
+| `--min-outgroup-count N` | outgroup | k-mer present in at least N outgroup genomes |
+| `--max-outgroup-count N` | outgroup | k-mer present in at most N outgroup genomes |
+| `--min-outgroup-frac F` | outgroup | k-mer present in at least fraction F of outgroup genomes |
+| `--max-outgroup-frac F` | outgroup | k-mer present in at most fraction F of outgroup genomes |
+| `--min-total-count N` | all genomes | sum of per-genome counts ≥ N (`filter` only) |
+| `--max-total-count N` | all genomes | sum of per-genome counts ≤ N (`filter` only) |
+| `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) |
+
+**Conditional defaults** — the defaults for `--min-frac` and `--max-outgroup-count` depend on two conditions:
+whether the corresponding group was declared, **and** whether any quorum flag for that group was explicitly set.
+
+> **Rule**: declaring a group activates the smart default **only if no quorum flag for that group is explicitly set**.
+> As soon as any quorum flag for a group is present on the command line, all defaults for that group revert to no-op values.
+
+| `--ingroup` | Any ingroup quorum flag? | `--min-frac` default |
+|-------------|--------------------------|----------------------|
+| not set | — | 0.0 (no-op) |
+| set | no | **1.0** — all ingroup genomes must carry the k-mer |
+| set | yes | 0.0 — user controls quorum explicitly |
+
+| `--outgroup` | Any outgroup quorum flag? | `--max-outgroup-count` default |
+|--------------|---------------------------|-------------------------------|
+| not set | — | outgroup size (no-op) |
+| set | no | **0** — no outgroup genome may carry the k-mer |
+| set | yes | outgroup size — user controls quorum explicitly |
+
+"Any ingroup quorum flag" means any of: `--min-count`, `--max-count`, `--min-frac`, `--max-frac`.  
+"Any outgroup quorum flag" means any of: `--min-outgroup-count`, `--max-outgroup-count`, `--min-outgroup-frac`, `--max-outgroup-frac`.
+
+**Why this rule?** Setting any quorum flag signals explicit intent — the defaults are there to help when the user omits quorum entirely, not to interfere with deliberate constraints. Mixing implicit and explicit quorum on the same group would risk silent incoherence (e.g. `--max-count 0` with an implicit `--min-frac 1.0`).
+
+All other bounds default to 0 / group size / 0.0 / 1.0 regardless of whether groups are declared.
+
+### Validation
+
+After resolving defaults, the following are checked and cause an immediate error:
+
+| Condition | Error |
+|-----------|-------|
+| `--min-count > --max-count` | incoherent bounds |
+| `--min-frac > --max-frac` | incoherent bounds |
+| `--min-outgroup-count > --max-outgroup-count` | incoherent bounds |
+| `--min-outgroup-frac > --max-outgroup-frac` | incoherent bounds |
+| any fraction outside `[0.0, 1.0]` | invalid value |
+
+The check applies to the **effective** values (after defaults are resolved), so an explicit `--max-frac 0.5` with an implicit `--min-frac 1.0` would have been caught — but the rule above prevents that situation from arising in the first place.
+
+Fractions are computed over the size of the classified group, not over total
+genome count. An empty group (no genome classified as ingroup/outgroup) never
+triggers a filter failure.
+
+### Conservative rounding of fraction thresholds
+
+When a fraction threshold `F` is applied to a group of size `N`, the effective
+integer threshold is determined by the direction of the bound:
+
+| Bound | Effective count | Rounding | Rationale |
+|-------|----------------|----------|-----------|
+| `--min-frac F` | k-mer in ≥ ⌈F·N⌉ genomes | **ceil** | stricter — when F·N is not an integer, a kmer present in ⌊F·N⌋ genomes does not meet the fraction |
+| `--max-frac F` | k-mer in ≤ ⌊F·N⌋ genomes | **floor** | stricter — when F·N is not an integer, a kmer present in ⌈F·N⌉ genomes already exceeds the fraction |
+
+The same rule applies symmetrically to `--min-outgroup-frac` (ceil) and
+`--max-outgroup-frac` (floor). The outgroup direction is not inverted: the
+conservative rounding depends only on whether the bound is a minimum or a
+maximum, not on which group it applies to.
+
+**Example** — `--min-frac 0.5` with an ingroup of 3 genomes:
+`⌈0.5 × 3⌉ = ⌈1.5⌉ = 2` → at least 2 of 3 ingroup genomes must carry the k-mer.
+
+**Implementation note** — the filter evaluates `n / denom < min_frac` directly
+(integer `n`, float comparison) rather than pre-computing `⌈F·N⌉`. This is
+mathematically equivalent for integer counts: `n / N < F` ↔ `n < F·N` ↔
+`n ≤ ⌈F·N⌉ − 1` ↔ `n < ⌈F·N⌉`. No explicit rounding is needed.
+
+## Examples
+
+Keep k-mers specific to *Betula nana* — present in at least 2 *B. nana* genomes
+and absent from every other genome in the index:
+
+```sh
+obikmer filter src --output dst \
+  --ingroup  "species=Betula_nana" \
+  --outgroup "*" \
+  --min-count 2 \
+  --max-outgroup-count 0
+```
+
+Keep k-mers found in at least 2 *Betula nana* genomes and absent from all
+other *Betula*:
+
+```sh
+obikmer filter src --output dst \
+  --ingroup  "species=Betula_nana" \
+  --outgroup "genus=Betula" \
+  --min-count 2 \
+  --max-outgroup-count 0
+```
+
+Use taxonomic paths — keep k-mers present in ≥ 50 % of the *Betula* clade
+and in at most 10 % of everything outside *Betulaceae*:
+
+```sh
+obikmer filter src --output dst \
+  --ingroup  "taxon~/Betulaceae/Betula" \
+  --outgroup "taxon!~/Betulaceae" \
+  --min-frac 0.5 \
+  --max-outgroup-frac 0.1
+```
+
+Multiple outgroup predicates (OR): exclude k-mers present in *Alnus* or *Carpinus*:
+
+```sh
+obikmer filter src --output dst \
+  --ingroup  "genus=Betula" \
+  --outgroup "genus=Alnus" \
+  --outgroup "genus=Carpinus" \
+  --max-outgroup-count 0
+```
+
+To dump only k-mers specific to *Betula nana*:
+
+```sh
+obikmer dump myindex \
+  --ingroup  "species=Betula_nana" \
+  --outgroup "*" \
+  --min-count 1 \
+  --max-outgroup-count 0
+```
+
+To enumerate unitigs of the *Betula*-specific subgraph:
+
+```sh
+obikmer unitig myindex \
+  --ingroup  "genus=Betula" \
+  --outgroup "*" \
+  --min-count 2 \
+  --max-outgroup-count 0
+```
+
+## Command-specific options
+
+### `dump --head N`
+
+Stops output after the first N k-mers that pass all active filters.
+Iteration terminates immediately — subsequent partitions and layers are not scanned.
+Useful for quick inspection of large indexes without loading the entire dataset.
+
+```sh
+obikmer dump myindex --head 100
+obikmer dump myindex --head 20 --ingroup "species=Betula_nana" --min-count 1
+```
+
+### `distance --presence-threshold N`
+
+When computing Jaccard distance on a **count index**, a k-mer is considered present in a genome if its count is ≥ N (default 1).
+This option is independent of the `--presence-threshold` used in filtering.
+
+```sh
+# Jaccard treating kmers with count ≥ 2 as present
+obikmer distance myindex --metric jaccard --presence-threshold 2
+```
+
+This parameter has no effect on presence/absence indexes (where values are already 0/1) or on metrics other than Jaccard.
+
+## Implementation
+
+- **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter`
+  using pre-computed ingroup and outgroup index vectors. The heavy logic
+  (predicate parsing, three-value evaluation, genome classification) happens
+  once before any iteration; each k-mer row evaluation is a simple index
+  lookup and counter.
+
+- **`obikmer::cmd::predicate::FilterArgs`** — shared `clap` argument group
+  embedded via `#[command(flatten)]` in the argument structs of the `filter`,
+  `dump`, and `unitig` commands (e.g. `DumpArgs`, `UnitigArgs`).
+  `FilterArgs::build_filters()` returns a ready-to-use filter list.
+
+- **`obikpartitionner::KmerPartition::iter_partition_kmers`** — accepts
+  `filters: &[Box<dyn KmerFilter>]` and applies them per-kmer before invoking
+  the callback. `filter`, `dump`, and `unitig` all go through this single
+  entry point.
@@ -0,0 +1,207 @@
+# Merge parallelism and memory pressure
+
+## Problem observed
+
+Running `obikmer merge` over 109 indexes (108 sources + 1 bootstrap) on a 192-core machine
+produces a fatal OOM during the `merge_partitions` stage:
+
+```
+memory allocation of 9126805520 bytes failed
+```
+
+A single allocation of ~8.5 GB fails. This is not an aggregate; it is one `malloc` call
+from hashbrown during a HashMap resize.
+
+---
+
+## Root cause
+
+### The merge pipeline per partition
+
+```
+source unitigs.bin
+  → iter_indexed_canonical_kmers()
+  → GraphDeBruijn::push()       ← HashSet<u64> + 1 byte flags, all in RAM
+  → compute_degrees_and_mark_starts()
+  → try_for_each_unitig()
+  → unitigs.bin (new layer)
+  → Layer::build() → MPHF + evidence
+```
+
+`GraphDeBruijn` is a `FastHashMap<CanonicalKmer, AtomicU8>` — a `HashSet<u64>` with
+one flag byte per node. Neighbor lookup is implicit: 4 probes into the same map.
+No edges are stored. The full kmer set of one partition must reside in RAM
+simultaneously to compute degrees and mark unitig starts.
+
+The matrix builders that follow (pass 2) are mmapped files — they do **not** consume
+significant RAM. The pressure is entirely in pass 1.
+
+### Unbounded Rayon parallelism
+
+With 192 cores, Rayon ran up to 192 partitions concurrently. Each partition built its
+own `GraphDeBruijn` accumulating all kmers absent from the destination. Peak memory =
+192 × peak_partition_hashset.
+
+### The 8.5 GB single allocation
+
+hashbrown allocates the entire backing array in one call when rehashing.
+At load factor 7/8: `capacity × (sizeof(K,V) + 1 control byte)`.
+For `(u64, AtomicU8)` with alignment: 16 bytes of data + 1 control byte = 17 bytes per slot.
+
+```
+9 127 MB / 17 bytes ≈ 537 M slots (2^29 buckets) → more than 7/8 × 2^28 ≈ 235 M new kmers in one partition
+```
+
+Plausible for the largest partition of 108 Salix/Betula sources (~450 Mbp each).
+
+---
+
+## Partition size distribution
+
+`obikmer utils --partition-stats` measures the sum of `unitigs.bin` file sizes
+per partition across all source indexes (pure `stat()` syscalls, negligible cost).
+
+Observed on a 9-genome pilot (256 partitions):
+
+| Stat | Value |
+|---|---|
+| min    | 30.5 MB |
+| max    | 232.1 MB |
+| mean   | 40.1 MB |
+| median | 37.2 MB |
+| p95    | 47.1 MB |
+| max/median ratio | 6.23× |
+
+The distribution is **bimodal with a heavy tail**:
+- 238/256 partitions in a narrow 30–50 MB band
+- 4 structurally extreme partitions (3–6× the median): 221, 233, 135, 191
+
+These correspond to minimizers over-represented in repetitive regions shared across
+all sources. They are extreme in every run on this dataset.
+
+With 109 sources, outlier partitions do not scale linearly: only kmers **absent from
+the destination** enter the GraphDeBruijn, and inter-source overlap is high for closely
+related species. Partition 221 is the likely trigger for the 8.5 GB crash.
+
+---
+
+## Solution: FFD scheduling + memory budget semaphore
+
+### Principle
+
+Pre-sort partitions by **decreasing estimated size** (First Fit Decreasing — FFD),
+then schedule them through a **continuous memory budget semaphore**. Each worker
+acquires an estimated cost before starting and releases it on completion.
+
+Large partitions run first when the full budget is available; small partitions fill
+the gaps. No hard outlier threshold is needed.
+
+### `MemoryBudget` (`obisys`)
+
+```rust
+pub struct MemoryBudget { … }
+
+impl MemoryBudget {
+    pub fn new(total: u64) -> Self;
+    pub fn acquire(&self, cost: u64);  // blocks until budget available
+    pub fn release(&self, cost: u64);
+    pub fn peak_active(&self) -> usize;
+}
+```
+
+Non-deadlock guarantee: when `active == 0`, acquire always succeeds regardless of cost.
+Without this, a partition whose estimated cost exceeds the total budget would block forever.
+
+### Adaptive expansion factor
+
+The expansion factor converts raw `unitigs.bin` bytes into an estimated GraphDeBruijn
+RAM footprint. hashbrown stores each kmer as `(u64, AtomicU8)` ≈ 16 bytes/kmer at 7/8
+load factor; unitig files encode ≈ 2 bits/base. The ratio depends on average unitig
+length (short unitigs: ~2×; long unitigs: up to ~50×).
+
+**Phase 1 — sequential pilot (worst partition)**
+
+The largest partition runs alone first. Its actual `g.len()` seeds the expansion factor
+before any parallel job starts. `FALLBACK_EXPANSION = 4×` is used only for empty partitions.
+
+```rust
+let worst_g_len = dst_partition.merge_partition(worst_id, …)?;
+//                              ↑ now returns SKResult<usize> (was SKResult<()>)
+
+let seed_expansion = worst_g_len as u64 * 16 * 1000 / worst_bytes;
+let max_expansion = AtomicU64::new(seed_expansion);
+```
+
+**Phase 2 — parallel with adaptive updates**
+
+```rust
+order[1..].into_par_iter().for_each(|&i| {
+    let cost = partition_sizes[i] * max_expansion.load(Relaxed) / 1000;
+    budget.acquire(cost);
+    let g_len = dst_partition.merge_partition(i, …)?;
+    budget.release(cost);          // releases estimated cost, not actual
+
+    let actual = g_len as u64 * 16 * 1000 / partition_sizes[i];
+    max_expansion.fetch_max(actual, Relaxed);  // always pessimistic (max)
+});
+```
+
+`budget.release(cost)` uses the estimated cost, not the actual one. The budget tracks
+reservations, not physical RAM; each partition pays what it promised at acquisition.
+
+**On the safety margin**
+
+There is no separate multiplier `k`. It is redundant with `budget_fraction`: both
+reduce effective concurrency by the same amount. A single parameter is easier to
+calibrate. `budget_fraction = 0.5` (default) reserves half of available RAM for the
+OS, MPHF build, pass 2, and estimation error.
+
+`--budget-fraction` is exposed as a CLI flag — the only escape hatch for pathological
+cases (extreme repetitive content, unusually long unitigs) that still cause OOM.
+
+### RAM source
+
+`obisys::available_memory_bytes()` — wraps `sysinfo::System::available_memory()`,
+falls back to `total / 2` on macOS when the memory compressor returns 0.
+
+---
+
+## Diagnostic report
+
+After the parallel phase, `merge_partitions` emits a structured report via `tracing::info!`:
+
+```
+─── merge_partitions memory report ───
+  available RAM : 512.0 GB   budget 50% = 256.0 GB
+  expansion factor — seed: 4.2×  final max: 6.1×  (mean: 1.8×  median: 1.6×)
+  peak concurrent workers: 42
+  expansion factor distribution (256 partitions with data):
+     0.50× –  1.25× │██████████████████████████████ 148
+     1.25× –  2.00× │████████████████████████        82
+     …
+     5.50× –  6.25× │█                                2
+  top partitions by actual expansion factor:
+    partition  221 : 6.10×  (232.1 MB unitigs → 48M kmers, reserved at 4.20×)
+    partition  135 : 5.82×  (127.3 MB unitigs → 24M kmers, reserved at 4.20×)
+    …
+──────────────────────────────────────
+```
+
+Fields useful for diagnosis:
+
+| Field | Interpretation |
+|---|---|
+| `seed` vs `final max` expansion | gap indicates partitions with higher expansion than the worst-by-size |
+| `reserved at X×` | the factor used at acquisition; if much lower than actual, the budget was under-reserved for that partition |
+| `peak concurrent workers` | effective parallelism achieved under the budget constraint |
+| `mean` / `median` expansion | typical dataset characteristic; stable across runs on the same data |
+
+---
+
+## Parameters
+
+| Parameter | Default | CLI flag | Notes |
+|---|---|---|---|
+| `fallback_expansion` | 4× | — | seed for empty partitions only |
+| `budget_fraction` | 0.5 | `--budget-fraction` | reduce if OOM persists |
+| RAM source | `obisys::available_memory_bytes()` | — | falls back to `total/2` on macOS |
@@ -0,0 +1,520 @@
+# obicompactvec — Complete Reference
+
+## Module structure
+
+```
+src/obicompactvec/src/
+  lib.rs            public re-exports
+  views.rs          BitSliceView<'a>, IntSliceView<'a> — zero-copy read views
+  traits.rs         ColumnWeights, CountPartials, BitPartials (matrix aggregation)
+  bitvec.rs         PersistentBitVec, PersistentBitVecBuilder, BitIter
+  reader.rs         PersistentCompactIntVec (read-only)
+  builder.rs        PersistentCompactIntVecBuilder (read-write)
+  tempintvec.rs     TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed)
+  tempbitvec.rs     TempBitVec, TempBitVecBuilder (temp-file-backed)
+  bitmatrix.rs      PersistentBitMatrix, PersistentBitMatrixBuilder
+  intmatrix.rs      PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder
+  colgroup.rs       ColGroup, MatrixGroupOps trait
+  format.rs         file format constants, encode/decode helpers
+  layer_meta.rs     LayerMeta (column metadata)
+  meta.rs           matrix metadata
+```
+
+```mermaid
+graph TD
+    views --> bitvec
+    views --> builder
+    views --> tempbitvec
+    views --> tempintvec
+    views --> bitmatrix
+    views --> intmatrix
+    format --> reader
+    format --> builder
+    reader --> intmatrix
+    reader --> tempintvec
+    builder --> intmatrix
+    builder --> tempintvec
+    bitvec --> tempbitvec
+    bitvec --> bitmatrix
+    tempintvec --> intmatrix
+    tempintvec --> bitmatrix
+    tempbitvec --> intmatrix
+    tempbitvec --> bitmatrix
+    colgroup --> intmatrix
+    colgroup --> bitmatrix
+    layer_meta --> bitmatrix
+    layer_meta --> intmatrix
+    meta --> bitmatrix
+    meta --> intmatrix
+```
+
+---
+
+## Compact int encoding
+
+All integer vectors use the same two-tier encoding regardless of storage backend.
+
+**Primary array** — one `u8` per slot:
+
+- Values **0–254** are stored directly. No overhead.
+- Value **255 is a sentinel**: the slot's actual value is ≥ 255 and lives in the overflow store.
+
+**Overflow store** — maps slot index to a `u32` value ≥ 255:
+
+- In `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM.
+- In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search.
+
+```mermaid
+flowchart LR
+    slot --> P["primary[slot]: u8"]
+    P -->|"< 255"| V["value = byte (0–254)"]
+    P -->|"= 255 sentinel"| OV["overflow store"]
+    OV -->|"Builder"| HM["HashMap&lt;usize, u32&gt;\nin RAM"]
+    OV -->|"PersistentCompactIntVec"| SA["sorted [(slot,value)] in mmap\n+ sparse L1 index"]
+```
+
+**Key property — sentinel 255 = +∞ on `u8`:**
+
+- `min(a, 255) = a` for all `a ≤ 254` → correct when only one side is overflow
+- `max(a, 255) = 255` → correct sentinel when either side is overflow
+- Only the **both-overflow** case requires reading actual values from the overflow store.
+
+In practice, k (overflow count) ≪ n (total slots). Observed genomic data: ~0.07% of kmer slots are in overflow.
+
+---
+
+## View types
+
+The previous trait hierarchy (`BitSlice`, `BitSliceMut`, `IntSlice`, `IntSliceMut`) has been replaced by two concrete zero-copy view structs with inherent methods. Views are **`Copy`** — passing them is free. All read operations live on these two types.
+
+### `BitSliceView<'a>`
+
+```rust
+#[derive(Clone, Copy)]
+pub struct BitSliceView<'a> { pub(crate) words: &'a [u64], pub(crate) n: usize }
+```
+
+Bit `i` is at `words[i >> 6]` bit `i & 63` (LSB-first). Padding bits in the last word are zero.
+
+| Method | Cost |
+|---|---|
+| `len()`, `is_empty()` | O(1) |
+| `get(slot)` | O(1) |
+| `count_ones()` | POPCNT per word, O(n/64) |
+| `count_zeros()` | `n − count_ones()`, O(n/64) |
+| `iter() -> BitSliceIter<'a>` | O(1) setup, O(n) iteration |
+| `partial_jaccard_dist(other: BitSliceView)` | `(a&b).popcount`, `(a\|b).popcount` per word, O(n/64) |
+| `jaccard_dist(other: BitSliceView)` | from partial, O(n/64) |
+| `hamming_dist(other: BitSliceView)` | `(a^b).popcount` per word, O(n/64) |
+
+`BitSliceIter<'a>`: word-level scan; one word per 64 iterations.
+
+### `IntSliceView<'a>`
+
+```rust
+#[derive(Clone, Copy)]
+pub struct IntSliceView<'a> {
+    pub(crate) primary:      &'a [u8],
+    pub(crate) overflow_raw: &'a [u8],   // sorted [(slot:u64, value:u32)] entries
+    pub(crate) n_overflow:   usize,
+    pub(crate) n:            usize,
+}
+```
+
+`overflow_raw` contains `n_overflow` entries of `OVERFLOW_ENTRY_SIZE` bytes each, sorted by slot. The sort invariant is established at `close()`/`freeze()` time.
+
+| Method | Cost |
+|---|---|
+| `len()`, `is_empty()` | O(1) |
+| `primary_bytes()` | O(1) |
+| `overflow_entries() -> impl Iterator<(usize,u32)>` | O(n_overflow) iteration |
+| `get(slot)` | O(1) primary; binary search O(log k) for overflow slots |
+| `iter() -> IntSliceViewIter<'a>` | merge scan, O(n + k) |
+| `sum()` | byte scan + overflow, O(n + k) |
+| `count_nonzero()` | byte scan, O(n) |
+| Distance methods (`bray_dist`, `euclidean_dist`, `jaccard_dist`, …) | O(n + k) |
+
+`IntSliceViewIter<'a>`: merge scan using `overflow_pos` index. Requires sorted overflow — guaranteed by the construction lifecycle.
+
+**Builder `view()` vs reader `view()`:** `PersistentCompactIntVecBuilder` stores overflow as an unsorted `HashMap`, not raw bytes. Its `view()` returns an `IntSliceView` with `overflow_raw = &[]` and `n_overflow = 0`. This is intentional — the view is primarily useful after `freeze()`. During building, callers that need overflow use `overflow_entries()` directly.
+
+---
+
+## Concrete types
+
+```mermaid
+classDiagram
+    class BitSliceView {
+        +words: &[u64]
+        +n: usize
+        +get(slot) bool
+        +count_ones() u64
+        +iter() BitSliceIter
+        +jaccard_dist/hamming_dist(other: BitSliceView)
+    }
+    class IntSliceView {
+        +primary: &[u8]
+        +overflow_raw: &[u8]
+        +n_overflow: usize
+        +n: usize
+        +get(slot) u32
+        +iter() IntSliceViewIter
+        +overflow_entries() Iterator
+        +bray_dist/euclidean_dist/…(other: IntSliceView)
+    }
+    class PersistentBitVec {
+        -mmap: Mmap
+        -n: usize
+        +view() BitSliceView
+        +get(slot) bool
+        +count_ones/zeros() u64
+        +iter() BitIter
+        +partial_jaccard_dist(&Self) (u64,u64)
+        +jaccard_dist/hamming_dist(&Self) …
+    }
+    class PersistentBitVecBuilder {
+        -mmap: MmapMut
+        -n: usize
+        +view() BitSliceView
+        +set(slot, bool)
+        +or/and/xor(BitSliceView)
+        +not()
+        +copy_from(BitSliceView)
+        +close() / finish() → PersistentBitVec
+    }
+    class PersistentCompactIntVec {
+        -mmap: Mmap
+        -n: usize
+        -n_overflow: usize
+        -step: usize
+        -index: Vec~(usize,usize)~
+        +view() IntSliceView
+        +get(slot) u32
+        +iter() Iter
+        +sum/count_nonzero() u64
+        +bray_dist/euclidean_dist/… (&Self)
+    }
+    class PersistentCompactIntVecBuilder {
+        -mmap: MmapMut
+        -n: usize
+        -overflow: HashMap~usize,u32~
+        +view() IntSliceView
+        +set(slot, u32) / get(slot) u32
+        +inc / inc_present / inc_present_fast
+        +inc_predicate / inc_predicate_fast
+        +add/min/max/diff/mask_with(…View)
+        +primary_bytes/primary_bytes_mut()
+        +close() / finish() → PersistentCompactIntVec
+    }
+
+    PersistentBitVec --> BitSliceView : view()
+    PersistentBitVecBuilder --> BitSliceView : view()
+    PersistentCompactIntVec --> IntSliceView : view()
+    PersistentCompactIntVecBuilder --> IntSliceView : view() (primary only)
+    PersistentBitVecBuilder --> PersistentBitVec : close() then open()
+    PersistentCompactIntVecBuilder --> PersistentCompactIntVec : close() then open()
+```
+
+### `PersistentBitVec` / `PersistentBitVecBuilder`
+
+`PersistentBitVec` is the read-only type. `view()` returns a `BitSliceView<'_>` over the mmap word array. Direct inherent methods delegate to the view: `count_ones()`, `count_zeros()`, `partial_jaccard_dist(&Self)`, `jaccard_dist(&Self)`, `hamming_dist(&Self)`.
+
+`BitIter<'a>` — exported iterator for `PersistentBitVec::iter()`:
+
+```rust
+pub struct BitIter<'a> { pub(crate) words: &'a [u64], pub(crate) slot: usize, pub(crate) n: usize }
+```
+
+`PersistentBitVecBuilder` is the read-write type. Mutation operations accept `BitSliceView<'_>`:
+
+| Method | Cost |
+|---|---|
+| `set(slot, bool)` | O(1) |
+| `view() -> BitSliceView<'_>` | O(1) |
+| `or/and/xor(BitSliceView)` | word-level, O(n/64), SIMD-friendly |
+| `not()` | `w ^= u64::MAX` per word, re-masks last word, O(n/64) |
+| `copy_from(BitSliceView)` | `copy_from_slice`, O(n/64) |
+
+### `PersistentCompactIntVec` / `PersistentCompactIntVecBuilder`
+
+`PersistentCompactIntVec` is the read-only type. `view()` returns an `IntSliceView<'_>` over the mmap primary and overflow arrays. Inherent `iter()` is a merge scan (`Iter` struct). Inherent `sum()` and `count_nonzero()` use fast byte-scan helpers.
+
+`PersistentCompactIntVecBuilder` is the read-write type. Mutation methods on the builder fall into two categories:
+
+**Point mutations:**
+
+| Method | Note |
+|---|---|
+| `set(slot, u32)` | writes primary[slot] or 255+overflow |
+| `get(slot) -> u32` | reads primary byte or HashMap |
+| `inc(slot)` | `get` + `set`, O(1) |
+
+**Bulk computation methods** — accept view arguments:
+
+| Method | Semantics | Overflow |
+|---|---|---|
+| `inc_present(BitSliceView)` | `+= 1` at each 1-bit | via `inc`, safe for any group size |
+| `inc_present_fast(BitSliceView)` | same, raw u8 `+= 1` | `debug_assert` no 255 reached |
+| `inc_predicate(IntSliceView, pred)` | `+= 1` where `pred(col[s])` | two-pass, safe |
+| `inc_predicate_fast(IntSliceView, pred)` | same, raw u8 | `debug_assert` no 255 reached |
+| `add(IntSliceView)` | `self[s] += other[s]` | primary fast path + overflow fallback |
+| `min(IntSliceView)` | byte min + both-overflow fixup | see algorithm below |
+| `max(IntSliceView)` | pre-pass + byte max | see algorithm below |
+| `diff(IntSliceView)` | saturating sub | self<255 hot path |
+| `mask_with(BitSliceView)` | zeros slots where mask bit = 0 | O(n_zeros) |
+
+**`inc_present_fast` / `inc_predicate_fast` invariant:** caller guarantees no counter reaches 255 during the operation (group size < 255 for `inc_present_fast`, or chunk size < 255 for `inc_predicate_fast`). Violation is caught by `debug_assert` in dev builds.
+
+**`min` algorithm:**
+
+Exploits 255 = +∞: byte-level min is correct unless both sides are overflow.
+
+```
+snapshot self_ov: Vec<(slot,val)>
+snapshot other_ov: HashMap<slot,val>
+clear_overflow()
+Pass 1 — byte min, SIMD-vectorizable, O(n)
+Pass 2 — both-overflow fixup, O(k_self):
+  for (slot, self_val) in self_ov:
+    if slot ∈ other_ov: set(slot, min(self_val, other_ov[slot]))
+```
+
+**`max` algorithm:**
+
+Cannot do byte max first — `max(255, b<255)=255` overwrites self's original overflow value. Pre-pass reads self's value at other's overflow slots before the byte pass.
+
+```
+Pre-pass O(k_other): for (slot, other_val) in other.overflow_entries():
+  set(slot, max(self.get(slot), other_val))
+Pass 1 — byte max, SIMD-vectorizable, O(n)
+```
+
+---
+
+## Matrix types
+
+Two matrix types (bit and int), each available in two storage formats — four combinations:
+
+| | Columnar format | Packed format |
+|---|---|---|
+| **Bit** | `PersistentBitMatrix` (Columnar variant) | `PersistentBitMatrix` (Packed variant) |
+| **Int** | `PersistentCompactIntMatrix` (Columnar variant) | `PersistentCompactIntMatrix` (Packed variant) |
+
+Both matrix types are enums (`Columnar` / `Packed`, plus `Implicit` for the bit matrix) behind a transparent API. `col_view(c)` returns the appropriate view directly:
+
+```rust
+// PersistentBitMatrix
+pub fn col_view(&self, c: usize) -> BitSliceView<'_>
+
+// PersistentCompactIntMatrix
+pub fn col_view(&self, c: usize) -> IntSliceView<'_>
+```
+
+No wrapper enums (`BitColView`, `IntColView`): the caller receives a `Copy` view struct immediately usable with any view method or bulk builder method.
+
+`pack_compact_int_matrix` and `pack_bit_matrix` convert columnar → packed format.
+
+---
+
+## Aggregation traits (matrix level)
+
+### ColumnWeights
+
+```rust
+trait ColumnWeights: Send + Sync {
+    fn col_weights(&self) -> Array1<u64>;         // sum per column
+    fn partial_kmer_counts(&self) -> Array1<u64>; // default = col_weights()
+}
+```
+
+`partial_kmer_counts` is overridden for count matrices to return `count_nonzero` per column (distinct kmers) rather than total count.
+
+### CountPartials
+
+Abstract required methods: `partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`, `partial_relfreq_bray`, `partial_relfreq_euclidean`, `partial_hellinger`.
+
+**Additivity rule:** self-contained partials (`partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`) can be element-wise summed across all `(partition, layer)` pairs. Normalised partials (`partial_relfreq_*`, `partial_hellinger`) require the **global** `col_weights` (accumulated across all layers and all partitions) as parameter.
+
+**`partial_threshold_jaccard` returns `(inter, union)`** because `union[i,j]` depends on both columns simultaneously.
+
+Provided finalisations:
+
+| Finalisation | Formula |
+|---|---|
+| `bray_dist_matrix()` | `1 − 2·partial_bray[i,j] / (w[i] + w[j])` |
+| `euclidean_dist_matrix()` | `√partial_euclidean[i,j]` |
+| `threshold_jaccard_dist_matrix(t)` | `1 − inter[i,j] / union[i,j]` |
+| `relfreq_bray_dist_matrix()` | `1 − partial_relfreq_bray[i,j]` |
+| `relfreq_euclidean_dist_matrix()` | `√partial_relfreq_euclidean[i,j]` |
+| `hellinger_dist_matrix()` | `√partial_hellinger[i,j] / √2` |
+| `hellinger_euclidean_dist_matrix()` | `√partial_hellinger[i,j]` |
+
+### BitPartials
+
+Required: `partial_jaccard() -> (Array2<u64>, Array2<u64>)`, `partial_hamming() -> Array2<u64>`. Both additive across layers and partitions.
+
+---
+
+## Temp-file-backed types
+
+**All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory.
+
+### Lifecycle
+
+```
+TempCompactIntVecBuilder::new(n)   →  writable mmap in TempDir
+     ↓  (inc_present_fast / inc_predicate_fast / add / mask_with / …)
+ .freeze()                          →  TempCompactIntVec  (read-only mmap + TempDir)
+     ↓  (optional)
+ .make_persistent(path)             →  PersistentCompactIntVec  (permanent file)
+```
+
+Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`.
+
+**Drop order**: `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }` — Rust drops fields in declaration order. `vec` (mmap) released before `_temp` (directory deleted). No explicit `drop()` needed.
+
+### TempCompactIntVec / TempCompactIntVecBuilder
+
+```rust
+pub struct TempCompactIntVec {
+    vec:   PersistentCompactIntVec,
+    _temp: TempDir,        // dropped after vec
+}
+
+pub(crate) struct TempCompactIntVecBuilder {
+    builder: PersistentCompactIntVecBuilder,
+    temp:    TempDir,
+}
+```
+
+`TempCompactIntVec`: read access via `get(slot)`, `sum()`, `iter()`, `view() -> IntSliceView<'_>`.
+
+`TempCompactIntVecBuilder`: full delegation to inner `PersistentCompactIntVecBuilder` — all bulk computation methods (`inc_present_fast`, `inc_predicate_fast`, `add`, `min`, `max`, `diff`, `mask_with`) are exposed as `pub(crate)`.
+
+### TempBitVec / TempBitVecBuilder
+
+```rust
+pub struct TempBitVec {
+    vec:   PersistentBitVec,
+    _temp: TempDir,
+}
+
+pub(crate) struct TempBitVecBuilder {
+    builder: PersistentBitVecBuilder,
+    temp:    TempDir,
+}
+```
+
+`TempBitVec`: read access via `get(slot)`, `count_ones()`, `view() -> BitSliceView<'_>`, `iter()`.
+
+`TempBitVecBuilder`: exposes `set(slot, bool)`, `or(BitSliceView)`, and:
+
+```rust
+pub(crate) fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool)
+```
+
+`or_where` — two passes, no intermediate allocation:
+
+```
+Pass 1 — primary bytes, O(n):
+  for slot in 0..n:
+    b = col.primary_bytes()[slot]
+    if b < 255 AND pred(b as u32): self.set(slot, true)
+
+Pass 2 — overflow, O(k):
+  for (slot, val) in col.overflow_entries():
+    if pred(val): self.set(slot, true)
+```
+
+---
+
+## Filter / Select API
+
+### ColGroup
+
+```rust
+pub struct ColGroup { pub name: String, pub indices: Vec<usize> }
+```
+
+Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions — column structure is identical across the entire hierarchy; only rows (kmer slots) are partitioned.
+
+### Composition axis
+
+- **Across partitions**: kmer space is partitioned → partial results **concatenated** (disjoint kmer ranges).
+- **Across layers**: same kmer space, different counts → partial results **aggregated** (add, OR, etc.).
+
+### MatrixGroupOps
+
+Five required primitives + two default methods derived from them. All return temp-file-backed types.
+
+```rust
+pub trait MatrixGroupOps {
+    // required
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempCompactIntVec>;
+    fn partial_group_sum(&self, g: &ColGroup)
+        -> io::Result<TempCompactIntVec>;
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempBitVec>;
+    fn partial_group_min(&self, g: &ColGroup)
+        -> io::Result<TempCompactIntVec>;
+    fn partial_group_max(&self, g: &ColGroup)
+        -> io::Result<TempCompactIntVec>;
+
+    // defaults derived from partial_group_presence_count
+    fn partial_group_all(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempBitVec>;   // slot=1 iff count == g.indices.len()
+    fn partial_group_none(&self, g: &ColGroup, threshold: u32)
+        -> io::Result<TempBitVec>;   // slot=1 iff count == 0
+}
+```
+
+Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`.
+
+For **bit matrices**: values are 0/1, so `partial_group_sum` = `partial_group_presence_count(g, 1)`; `partial_group_min` is AND (set first column then mask-with remaining); `partial_group_max` is OR via `partial_group_any` + `inc_present`.
+
+**`partial_group_presence_count` — chunking for large groups:**
+
+When `g.indices.len() < 255`: per-slot counts stay within `u8` range. Use `inc_present_fast` (bit) or `inc_predicate_fast(col_view(c), |v| v >= threshold)` (int) — raw u8 increment, no overflow entry written.
+
+When `g.indices.len() ≥ 255`: process in chunks of 254 columns, accumulate via `.add(chunk_frozen.view())`.
+
+**`partial_group_min` (int matrix)**: copy first column via `.add(col_view(first))` (start from 0 ⇒ copy), then `.min(col_view(c))` for remaining.
+
+**`partial_group_max` (int matrix)**: `.max(col_view(c))` for all columns (start from 0 ⇒ first column acts as copy).
+
+**`partial_group_any`** uses `or_where` on `TempBitVecBuilder` (two-pass: primary bytes then overflow entries).
+
+**`partial_group_all` / `partial_group_none`** (default): call `partial_group_presence_count`, then iterate slots to produce the bit result. O(n) extra pass, not chunked.
+
+### add_col_from — matrix builder integration
+
+Both matrix builders accept temp-file results directly:
+
+```rust
+// PersistentBitMatrixBuilder
+fn add_col_from(&mut self, src: &TempBitVec)         -> io::Result<()>
+fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()>  // nonzero → 1
+
+// PersistentCompactIntMatrixBuilder
+fn add_col_from(&mut self, src: &TempCompactIntVec)  -> io::Result<()>
+fn add_col_from_bit(&mut self, src: &TempBitVec)     -> io::Result<()>  // bit → 0/1 u32
+```
+
+`add_col_from` copies the temp file to the matrix directory and increments `n_cols`; `close()` writes `meta.json` with the final column count. No separate `write_meta` step needed.
+
+### mask_with
+
+Direct method on `PersistentCompactIntVecBuilder` (and delegation via `TempCompactIntVecBuilder`). Zeros every slot where the corresponding mask bit is 0. Scans the mask word by word and visits only zero bits — O(n/64 + n_zeros); an all-ones word is skipped with a single comparison, so an all-ones mask costs only the word scan.
+
+```
+for (w_idx, word) in mask.words():
+  if word == u64::MAX: continue   // skip all-ones words
+  zeros = !word
+  while zeros != 0:
+    bit = trailing_zeros(zeros)
+    s = w_idx * 64 + bit
+    if primary[s] != 0: set(s, 0)   // clears overflow entry too
+    zeros &= zeros − 1
+```
+
+Terminal operation for Filter (retain only selected kmer slots in a count vector) and Select (positional selection without MPHF).
@@ -0,0 +1,143 @@
+# `obitaxonomy` — taxonomy concept paths
+
+`obitaxonomy` is a dependency-free crate that defines a typed representation
+of hierarchical concept paths (taxonomic or otherwise) stored in genome metadata.
+
+---
+
+## Concept path syntax
+
+A concept path is stored as a metadata value with the prefix `taxonomy:/`:
+
+```
+taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species
+```
+
+Structure:
+
+- The `taxonomy:/` prefix is the type discriminator. Any metadata value starting
+  with it is parsed as a `TaxPath`; all others remain plain strings.
+- The remainder is one or more `/`-separated segments.
+- Each segment is `name` or `name@rank`, where `rank` is a label for the
+  taxonomic level (e.g. `family`, `genus`, `species`).
+- Rank annotations are **optional per segment** and can be mixed freely.
+- Spaces are allowed in both names and ranks.
+
+### Reserved character
+
+`@` is reserved throughout the taxonomy system and may **not** appear in:
+
+| Context | Constraint |
+|---------|------------|
+| Segment name | forbidden |
+| Rank/class label | forbidden |
+| Metadata key names | forbidden (used as `key@rank` in predicate syntax) |
+
+`@` is freely allowed in plain-text metadata values (non-taxonomy).
+
+### Parse errors
+
+| Condition | Error |
+|-----------|-------|
+| Value does not start with `taxonomy:/` | `MissingPrefix` |
+| No segments after the prefix | `EmptyPath` |
+| Segment with empty name (consecutive `/`) | `EmptySegmentName` |
+| Segment with trailing `@` and no rank (`name@`) | `EmptyRankName` |
+| Segment with more than one `@` | `AmbiguousRank` |
+
+---
+
+## Public API
+
+### `TaxSegment`
+
+A single node: a name and an optional rank.
+
+```rust
+seg.name()            // &str
+seg.rank()            // Option<&str>
+seg.to_string()       // "name" or "name@rank"
+TaxSegment::parse(s)  // Result<TaxSegment, TaxError>
+```
+
+### `TaxPath`
+
+```rust
+TaxPath::parse(s)               // Result<TaxPath, TaxError>
+path.segments()                 // &[TaxSegment]
+path.depth()                    // usize — number of segments
+path.is_ancestor_of(&other)     // bool — prefix match by name, ranks ignored
+path.name_at_rank("genus")      // Option<&str>
+path.to_string()                // reconstructs "taxonomy:/…"
+```
+
+`is_ancestor_of` compares segment **names** only — rank annotations are
+informational and do not affect the ancestry relation.
+
+```rust
+let a: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus".parse()?;
+let b: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species".parse()?;
+
+assert!(a.is_ancestor_of(&b));   // true
+assert!(b.is_ancestor_of(&a));   // false
+assert!(a.is_ancestor_of(&a));   // true  (equal ⇒ ancestor)
+
+assert_eq!(b.name_at_rank("species"), Some("Escherichia coli"));
+assert_eq!(b.name_at_rank("genus"),   Some("Escherichia"));
+assert_eq!(b.name_at_rank("order"),   None);
+```
+
+---
+
+## Integration with `GenomeInfo`
+
+At index load time, every metadata value is inspected once:
+
+- Starts with `taxonomy:/` → parsed into `TaxPath`, stored in `genome.taxonomy`.
+- Otherwise → kept as-is in `genome.meta`.
+
+```rust
+struct GenomeInfo {
+    label:    String,
+    meta:     HashMap<String, String>,    // plain text metadata
+    taxonomy: HashMap<String, TaxPath>,   // parsed taxonomy metadata
+}
+```
+
+The raw string is not duplicated. `TaxPath::to_string()` reconstructs the
+original value losslessly for serialisation.
+
+---
+
+## Predicate operators (in `filter` / `select`)
+
+Path predicates use the `~` / `!~` operators. After the `taxonomy:` type prefix,
+the **stored path** always starts with `/` (rooted path); the **query pattern**
+does not need to.
+
+### Path pattern syntax
+
+| Pattern | Semantics |
+|---------|-----------|
+| `A/B` | contiguous sub-path A then B, anywhere in the value |
+| `/A/B` | value starts with A then B (start-anchored) |
+| `A/B$` | value ends with A then B (end-anchored) |
+| `/A/B$` | value is exactly A then B (fully anchored) |
+| `A@x/B` | A with class `x` followed by B with any class |
+| `A@x/B@y` | A with class `x` followed by B with class `y` |
+
+A segment pattern without `@` matches the segment name regardless of its stored class.
+
+### Rank-aware queries
+
+```
+key@rank=value
+```
+
+| Predicate form | Semantics |
+|----------------|-----------|
+| `key@rank=value` | genome's `key` has `value` at rank `rank` |
+| `key@rank!=value` | does not |
+| `key@rank=v1\|v2` | value at `rank` is `v1` or `v2` |
+
+`~` combined with `@rank` on the key (e.g. `key@genus~pattern`) is not defined
+and is rejected at parse time.
@@ -1,188 +0,0 @@
-# Kmer filtering and ingroup/outgroup predicates
-
-The `rebuild`, `dump`, and `unitig` commands all share the same filtering
-system. Filters can select k-mers based on per-genome quorum counts, optionally
-restricted to **ingroup** and **outgroup** genome sets derived from genome
-metadata.
-
-`rebuild` additionally accepts `--min-total-count` / `--max-total-count` filters
-that operate on the sum of counts across all genomes.
-
-## Predicate syntax
-
-Each `--ingroup` and `--outgroup` flag takes a predicate of the form:
-
-```
-key OP value1|value2|…
-```
-
-| Operator | Meaning |
-|----------|---------|
-| `*` or `all` | wildcard — every genome matches unconditionally |
-| `key=v1\|v2` | exact match — genome's `key` equals `v1` or `v2` |
-| `key!=v` | negation — genome's `key` equals none of the values |
-| `key~path` | path ancestry — genome's `key` is `path` or a descendant |
-| `key!~path` | not a descendant |
-
-Multiple values separated by `|` are always OR-ed within the predicate.
-
-### Path matching (`~` and `!~`)
-
-Metadata values can represent hierarchical taxonomic paths such as
-`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
-
- **Absolute pattern** (starts with `/`): the value must start with the pattern
-  at a segment boundary.
-  `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
-  `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
- **Bare segment** (no leading `/`): the value must contain the pattern as an
-  exact path component anywhere.
-  `taxon~Betula` matches any path that has `Betula` as one of its segments.
-
-### Missing metadata key → NA
-
-If a genome does not carry the queried metadata key, the predicate returns **NA**.
-NA propagates through the group evaluation logic (see below), and genomes that
-cannot be classified are **ignored** in all quorum counts.
-
-## Group semantics
-
-### Multiple predicates
-
-| Flag | Combination rule |
-|------|-----------------|
-| `--ingroup` (repeated) | **AND** — genome must satisfy all predicates |
-| `--outgroup` (repeated) | **OR** — genome satisfies any predicate |
-
-### Three-value logic
-
-Each predicate returns `true`, `false`, or `NA` (absent key).
-
- AND: `false` absorbs everything; `NA` propagates unless already `false`.
- OR: `true` absorbs everything; `NA` propagates unless already `true`.
-
-### Classification and priority
-
-For each genome:
-
-1. Evaluate `AND(ingroup predicates)` → `in_result`
-2. Evaluate `OR(outgroup predicates)` → `out_result`
-3. If `in_result = true` → **Ingroup** (ingroup wins over outgroup)
-4. Else if `out_result = true` → **Outgroup**
-5. Otherwise → **Uncategorized** (ignored in all quorum counts)
-
-### Implicit groups
-
-| `--ingroup` | `--outgroup` | Effective behaviour |
-|-------------|--------------|---------------------|
-| not set | not set | all genomes form the ingroup |
-| set | not set | only ingroup quorum flags apply |
-| not set | set | only outgroup quorum flags apply |
-| set | set | both constraints apply simultaneously |
-
-## Quorum flags
-
-| Flag | Applies to | Meaning |
-|------|-----------|---------|
-| `--min-count N` | ingroup | k-mer present in at least N ingroup genomes |
-| `--max-count N` | ingroup | k-mer present in at most N ingroup genomes |
-| `--min-frac F` | ingroup | k-mer present in at least fraction F of ingroup genomes |
-| `--max-frac F` | ingroup | k-mer present in at most fraction F of ingroup genomes |
-| `--min-outgroup-count N` | outgroup | k-mer present in at least N outgroup genomes |
-| `--max-outgroup-count N` | outgroup | k-mer present in at most N outgroup genomes |
-| `--min-outgroup-frac F` | outgroup | k-mer present in at least fraction F of outgroup genomes |
-| `--max-outgroup-frac F` | outgroup | k-mer present in at most fraction F of outgroup genomes |
-| `--min-total-count N` | all genomes | sum of per-genome counts ≥ N (`rebuild` only) |
-| `--max-total-count N` | all genomes | sum of per-genome counts ≤ N (`rebuild` only) |
-| `--presence-threshold N` | all | per-genome count > N to be considered "present" (default 0) |
-
-Defaults: mins = 0 (no lower bound), max counts = group size, max fracs = 1.0
-(no upper bound). A filter with all defaults is a no-op.
-
-Fractions are computed over the size of the classified group, not over total
-genome count. An empty group (no genome classified as ingroup/outgroup) never
-triggers a filter failure.
-
-## Examples
-
-Keep k-mers specific to *Betula nana* — present in at least 2 *B. nana* genomes
-and absent from every other genome in the index:
-
-```sh
-obikmer rebuild src --output dst \
-  --ingroup  "species=Betula_nana" \
-  --outgroup "*" \
-  --min-count 2 \
-  --max-outgroup-count 0
-```
-
-Keep k-mers found in at least 2 *Betula nana* genomes and absent from all
-other *Betula*:
-
-```sh
-obikmer rebuild src --output dst \
-  --ingroup  "species=Betula_nana" \
-  --outgroup "genus=Betula" \
-  --min-count 2 \
-  --max-outgroup-count 0
-```
-
-Use taxonomic paths — keep k-mers present in ≥ 50 % of the *Betula* clade
-and in fewer than 10 % of everything outside *Betulaceae*:
-
-```sh
-obikmer rebuild src --output dst \
-  --ingroup  "taxon~/Betulaceae/Betula" \
-  --outgroup "taxon!~/Betulaceae" \
-  --min-frac 0.5 \
-  --max-outgroup-frac 0.1
-```
-
-Multiple outgroup predicates (OR): exclude k-mers present in *Alnus* or *Carpinus*:
-
-```sh
-obikmer rebuild src --output dst \
-  --ingroup  "genus=Betula" \
-  --outgroup "genus=Alnus" \
-  --outgroup "genus=Carpinus" \
-  --max-outgroup-count 0
-```
-
-The same flags work identically for `dump` and `unitig`. To dump only k-mers
-specific to *Betula nana*:
-
-```sh
-obikmer dump myindex \
-  --ingroup  "species=Betula_nana" \
-  --outgroup "*" \
-  --min-count 1 \
-  --max-outgroup-count 0
-```
-
-To enumerate unitigs of the *Betula*-specific subgraph:
-
-```sh
-obikmer unitig myindex \
-  --ingroup  "genus=Betula" \
-  --outgroup "*" \
-  --min-count 2 \
-  --max-outgroup-count 0
-```
-
-## Implementation
-
- **`obikpartitionner::filter::GroupQuorumFilter`** — implements `KmerFilter`
-  using pre-computed ingroup and outgroup index vectors. The heavy logic
-  (predicate parsing, three-value evaluation, genome classification) happens
-  once before any iteration; each k-mer row evaluation is a simple index
-  lookup and counter.
-
- **`obikmer::cmd::predicate::FilterArgs`** — shared `clap` argument group
-  embedded via `#[command(flatten)]` in `RebuildArgs`, `DumpArgs`, and
-  `UnitigArgs`. `FilterArgs::build_filters()` returns a ready-to-use filter
-  list.
-
- **`obikpartitionner::KmerPartition::iter_partition_kmers`** — accepts
-  `filters: &[Box<dyn KmerFilter>]` and applies them per-kmer before invoking
-  the callback. `rebuild`, `dump`, and `unitig` all go through this single
-  entry point.
@@ -0,0 +1,234 @@
+# `select` — column projection and aggregation
+
+`select` transforms an index by operating on its **genome columns**: projecting a
+subset of columns, aggregating groups of genomes into synthetic columns, or both.
+It is the column-axis counterpart of `filter` (row-axis operations).
+
+Following relational algebra conventions:
+
+| Command  | Relational operation | Axis     |
+|----------|---------------------|----------|
+| `filter` | σ — selection       | rows (k-mers) |
+| `select` | π — projection      | columns (genomes) |
+
+The two commands compose naturally: run `filter` first to restrict the k-mer set,
+then `select` to reshape the genome columns.
+
+`select` never changes the k-mer set. The MPHF and `unitigs.bin` of each layer
+are preserved unchanged; only the data matrices are rewritten.
+
+---
+
+## Synopsis
+
+```sh
+obikmer select <input-index>
+        { --output <dir> | --in-place }
+        [--group    <name>:<pred>  ...]
+        [--group-op <name>:<op>    ...]
+        [--aggregate-by <key>          ]
+        [--aggregate-op <op>           ]
+        [--select   <col1,col2,...>    ]
+        [--presence-threshold <N>      ]
+```
+
+---
+
+## Output destination
+
+Exactly one of `--output` or `--in-place` must be specified.
+
+**`--output <dir>`** — writes a new index to `<dir>`. The source index is
+unchanged. The MPHF and unitig files are copied; only the data matrices are
+rewritten with the new column layout.
+
+**`--in-place`** — rewrites the data matrices of the source index directly.
+Removed or replaced columns are lost. The operation writes to temporary files
+first, then renames atomically, so an interrupted run leaves the index intact.
+
+---
+
+## Defining output columns
+
+### Named groups — `--group`
+
+```
+--group <name>:<pred>
+```
+
+Defines a named group of genomes using the same predicate syntax as `filter`.
+Repeatable; a genome can belong to several groups.
+
+```sh
+--group "pub:species=Betula_pubescens"
+--group "nan:species=Betula_nana"
+```
+
+### Per-group operator — `--group-op`
+
+```
+--group-op <name>:<op>
+```
+
+Assigns an aggregation operator to a named group. Optional; if absent, the
+default operator applies (see below).
+
+```sh
+--group-op "pub:any"
+--group-op "nan:all"
+```
+
+### Shorthand — `--aggregate-by` / `--aggregate-op`
+
+`--aggregate-by <key>` automatically creates one group per unique value of the
+metadata key `<key>`. Equivalent to one `--group <val>:<key>=<val>` per distinct
+value. `--aggregate-op <op>` sets the operator for all auto-generated groups.
+
+`--aggregate-by` and `--group` are mutually exclusive.
+
+### Column selection and ordering — `--select`
+
+```
+--select col1,col2,...
+```
+
+Lists the output columns in order. Each element is either a group name (defined
+by `--group` or generated by `--aggregate-by`) or a genome label from the source
+index (pass-through, no aggregation).
+
+**Default when `--select` is absent:**
+all defined groups in declaration order (for `--group`), or all generated groups
+in metadata-value order (for `--aggregate-by`). Individual genomes not in any
+group are excluded unless named explicitly.
+
+**When neither `--group` nor `--aggregate-by` is specified:**
+`--select` can still reference genome labels for pure column projection (no
+aggregation). If `--select` is also absent, all genomes are output unchanged
+(identity transform — useful combined with row filtering via a prior `filter`
+run).
+
+---
+
+## Aggregation operators
+
+| Operator | Input       | Output   | Semantics |
+|----------|-------------|----------|-----------|
+| `any`    | pres / count | presence | 1 if ≥ 1 genome in group carries the k-mer |
+| `all`    | pres / count | presence | 1 if every genome in group carries the k-mer |
+| `none`   | pres / count | presence | 1 if no genome in group carries the k-mer |
+| `sum`    | count        | count    | sum of counts across the group |
+| `min`    | count        | count    | minimum count |
+| `max`    | count        | count    | maximum count |
+
+**Default operator:**
+- Presence index: `any`
+- Count index: `sum`
+
+Logical operators (`any`/`all`/`none`) on a count index use
+`--presence-threshold N` (default 0): a genome "carries" the k-mer if its count
+is > N.
+
+**Output index type:**
+- If the source is a presence index, the output is always a presence index.
+- If the source is a count index and every output column uses a logical operator
+  or is a pass-through from a presence source, the output is a presence index.
+- Otherwise (at least one arithmetic operator on a count source), the output is
+  a count index.
+
+---
+
+## Behaviour for edge cases
+
+| Situation | Behaviour |
+|-----------|-----------|
+| Genome missing the metadata key in `--aggregate-by` | genome ignored (no `NA` group) |
+| Genome in multiple groups | contributes independently to each |
+| `--group-op` references undefined group | error |
+| `--select` element is neither group name nor genome label | error |
+| `--output` and `--in-place` both specified | error |
+| Neither `--output` nor `--in-place` | error |
+| Group with zero matching genomes | column is all-zeros (or all-ones for `none`) |
+
+---
+
+## Examples
+
+### Aggregate by metadata group, default operators
+
+```sh
+obikmer select myindex --output out --aggregate-by group
+# one column per unique value of "group"; presence→any, count→sum
+```
+
+### Named groups with different operators
+
+```sh
+obikmer select myindex --output out \
+  --group    "pub:species=Betula_pubescens" \
+  --group    "nan:species=Betula_nana" \
+  --group-op "pub:any" \
+  --group-op "nan:all" \
+  --select   "pub,nan"
+```
+
+### Mix aggregated group and individual genome
+
+```sh
+obikmer select myindex --output out \
+  --group  "A:group=A" \
+  --select "A,Betula_nana--IGA-24-39"
+```
+
+### Pure column projection (no aggregation)
+
+```sh
+obikmer select myindex --output out \
+  --select "Betula_nana--TROM-V-149986,Betula_nana--AG-P04-25-01"
+```
+
+### In-place: keep only group A
+
+```sh
+obikmer select myindex --in-place --group "A:group=A" --select "A"
+```
+
+### Compose with filter
+
+```sh
+# Step 1: keep only B. nana-specific k-mers
+obikmer filter myindex --output filtered \
+  --ingroup "species=Betula_nana" --outgroup "*"
+
+# Step 2: aggregate genome columns by collection site
+obikmer select filtered --output final --aggregate-by site
+```
+
+---
+
+## Implementation notes
+
+`select` does not rebuild the MPHF. The 256 partitions are processed in parallel
+(rayon), each writing its output independently; results require no synchronisation
+because every partition owns a distinct set of files.
+
+For each layer in each partition:
+
+1. The slot count `n` is read by opening the source data matrix.
+2. A new data matrix is built with M columns (M = number of output columns).
+3. For each slot `s` in `0..n`:
+   - `old_row = matrix.fill_row(s)` — reads the original `N`-column row without allocating.
+   - For each output column `j`:
+     - `new_row[j] = aggregate(op, old_row[group_indices])`.
+     - Pass-through columns are represented as single-element groups with the
+       default operator (`any` for presence, `sum` for count) — same code path.
+   - The new row is written slot by slot into each column builder.
+4. All plain files in the source layer directory (`mphf.bin`, `unitigs.bin`,
+   evidence files, `layer_meta.json`) are copied verbatim; only the `presence/`
+   or `counts/` subdirectory is rewritten.
+5. `index.meta` is rewritten with the new genome list and updated `with_counts`.
+
+**`--in-place` write strategy:** new data is written to a temporary sibling
+directory (`presence_new/` or `counts_new/`); on success the old directory is
+removed and the temporary one is renamed into place. An interrupted run leaves
+at most one stale `*_new/` directory; the original data is intact until the
+old directory is removed, immediately before the rename.
@@ -9,12 +9,13 @@
 | `superkmer` | Extract super-kmers from a sequence file and write to stdout |
 | `index`     | Build a complete genome index (scatter → dereplicate → count → layered MPHF) |
 | `merge`     | Merge multiple built indexes into one |
-| `rebuild`   | Filter and compact an existing index into a new single-layer index; supports ingroup/outgroup predicates on genome metadata |
+| `filter` | Apply row-level selection (σ) to an index: retain only k-mers matching the ingroup/outgroup predicates. Output is a new single-layer index — compaction is a consequence, not the goal. Supports the shared [kmer filtering](implementation/filtering.md) system |
 | `query`     | Query an index with sequences and annotate matches |
-| `dump`      | Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the same ingroup/outgroup filtering as `rebuild` |
+| `dump`      | Dump all indexed k-mers as CSV (kmer + per-genome counts or presence); supports the shared [kmer filtering](implementation/filtering.md) system; `--head N` limits output to the first N k-mers |
 | `annotate`  | Add or update genome metadata from a CSV file; or dump metadata as CSV |
-| `distance`  | Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees |
-| `unitig`    | Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the same ingroup/outgroup filtering as `rebuild` |
+| `distance`  | Compute pairwise distance matrix between genomes; optionally build NJ/UPGMA trees; `--presence-threshold N` sets the minimum count to consider a k-mer present when computing Jaccard on count indexes (default 1) |
+| `unitig`    | Build a global de Bruijn graph across all partitions and enumerate its unitigs as FASTA; supports the shared [kmer filtering](implementation/filtering.md) system |
+| `select`    | Project and/or aggregate genome columns into a new or in-place index; the column-axis counterpart of `filter` (see [select](implementation/select.md)) |
 | `estimate`  | Estimate approximate-index parameters (z, evidence bits, FP rates) before indexing |
 | `reindex`   | Convert an index's evidence in-place: exact ↔ approx |
 | `utils`     | Miscellaneous index utilities: `--new-label NEW=OLD` renames a genome label; `--upgrade-index` adds missing `layer_meta.json` to old indexes |
@@ -0,0 +1,84 @@
+# Installation
+
+## Prerequisites
+
+### Rust toolchain
+
+`obikmer` requires **Rust 1.85 or later** (edition 2024). Install or update via [rustup](https://rustup.rs):
+
+```bash
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+rustup update stable
+```
+
+### C build environment (required for hwloc)
+
+`obikmer` embeds [hwloc](https://www.open-mpi.org/projects/hwloc/) (Hardware Locality) for NUMA-aware thread placement on multi-socket machines. hwloc is built from source at compile time via the `vendored` feature of the `hwlocality` crate. This requires a standard C build environment.
+
+#### Linux (Debian/Ubuntu)
+
+```bash
+apt install build-essential automake libtool autoconf pkg-config
+```
+
+#### Linux (RHEL/Rocky/AlmaLinux)
+
+```bash
+dnf install gcc make automake libtool autoconf pkgconfig
+```
+
+#### HPC clusters
+
+Most HPC clusters provide these tools via the module system:
+
+```bash
+module load gcc automake libtool autoconf
+```
+
+If in doubt, check whether `autoreconf --version` and `libtool --version` return successfully.
+
+#### macOS
+
+```bash
+brew install automake libtool autoconf pkg-config
+```
+
+## Building
+
+```bash
+git clone <repository-url>
+cd obikmer/src
+cargo build --release
+```
+
+The compiled binary is at `target/release/obikmer`.
+
+### Building on HPC clusters (network filesystems)
+
+HPC home directories are typically on a network filesystem (Lustre, NFS) optimised for large sequential reads — not for the thousands of small file operations that Cargo generates during compilation. Building directly on such a filesystem can be extremely slow (0.1% CPU utilisation, tens of minutes for what should take seconds).
+
+**Always redirect the build directory to a local scratch disk:**
+
+```bash
+CARGO_TARGET_DIR=/scratch/$USER/cargo-target cargo build --release
+```
+
+Adapt the path to the local scratch available on your cluster (`/var/tmp`, `/tmp`, `/scratch/local`, etc.). Once built, copy the binary to a permanent location:
+
+```bash
+cp /scratch/$USER/cargo-target/release/obikmer ~/bin/
+```
+
+## NUMA support
+
+NUMA-aware thread placement is active automatically on multi-socket Linux machines (detected at runtime via hwloc). No special build flag is required — the detection is built in and falls back gracefully to the single-pool adaptive strategy on:
+
+- macOS (Apple Silicon, unified memory)
+- single-socket Linux machines
+- any system where hwloc reports only one NUMA node
+
+## Verifying the installation
+
+```bash
+obikmer --help
+```
@@ -2,3 +2,4 @@

 - [Project domain](project_domain.md) — obikmer est pour la génomique (génomes individuels), pas la métagénomique
 - [No architectural decisions without authorization](feedback_architectural_decisions.md) — toute décision architecturale (mémoire, algo, structure) requiert l'accord explicite de l'utilisateur avant toute action
+- [Phases intra-partition parallèles](feedback_phases_parallelism.md) — graph build, compute_degrees, unitig traversal, MPHF utilisent Rayon — ne jamais les appeler "séquentielles"
@@ -0,0 +1,12 @@
+---
+name: feedback-phases-parallelism
+description: Les phases intra-partition (graph build, compute_degrees, unitig traversal, MPHF) utilisent toutes Rayon — elles ne sont PAS séquentielles
+metadata:
+  type: feedback
+---
+
+Ne jamais qualifier les phases intra-partition de "séquentielles". Chaque phase (graph build, compute_degrees, unitig traversal, MPHF build) utilise Rayon en interne et s'exécute en parallèle sur plusieurs cœurs.
+
+**Why:** L'utilisateur a corrigé ce point plusieurs fois. Le décrire comme "séquentiel" est une erreur factuelle qui fausse l'analyse de performance.
+
+**How to apply:** Quand on analyse l'efficacité CPU ou les 25% manquants, chercher la cause dans le déséquilibre de charge entre partitions, la contention Rayon entre workers, ou la latence inter-partitions — pas dans une prétendue sérialisation des phases.
@@ -29,6 +29,7 @@ extra_javascript:

 nav:
  - Home: index.md
+  - Installation: installation.md
  - Theory:
      - Kmers and super-kmers: kmers.md
      - DNA encoding: theory/encoding.md
@@ -49,10 +50,15 @@ nav:
      - PersistentCompactIntVec: implementation/persistent_compact_int_vec.md
      - PersistentBitVec: implementation/persistent_bit_vec.md
      - Merge command: implementation/merge.md
-      - Kmer filtering (rebuild/dump/unitig): implementation/rebuild_filter.md
+      - Merge parallelism & memory: implementation/merge_parallelism.md
+      - Kmer filtering: implementation/filtering.md
+      - Select command: implementation/select.md
+      - obitaxonomy crate: implementation/obitaxonomy.md
  - Architecture:
      - Sequences: architecture/sequences/invariant.md
      - Kmer index: architecture/index_architecture.md
+      - NUMA-aware worker pools: architecture/numa_worker_pools.md
+      - NUMA-aware partition runner: architecture/numa_partition_runner.md

 watch:
  - docmd
@@ -0,0 +1,44 @@
+# La crate obicompactvector
+
+Le code actuel est ce qu'il est. Ce n'est pas la vérité absolue, c'est un premier effort d'implémentation, rien de plus. Ci-dessous je vais décrire les objectifs et la structure qui devrait être. LA VÉRITÉ À ATTEINDRE.
+
+La crate fournit des représentations les plus compactes possible en mémoire de matrices de comptage ou de présence de k-mers dans des génomes. Chaque colonne représente un génome, chaque ligne un k-mer. Une matrice est une collection de vecteurs où chacun des vecteurs est une colonne de la matrice.
+
+Les matrices comme les colonnes ont vocation à être persistantes. Les données sont stockées dans des fichiers binaires. Les données sont mappées en mémoire via `mmap`.
+
+Les structures sont par essence immutables. Il existe des représentations mutables des colonnes qui permettent leur construction. À la fin de leur construction, les colonnes sont fermées, ce qui les rend immutables.
+
+Les matrices peuvent être représentées de deux façons :
+
+- via un répertoire contenant une collection de fichiers colonnes ;
+- via un fichier matrice qui est la concaténation de plusieurs fichiers colonnes.
+
+## Les matrices de comptage 
+
+Ce sont des matrices d'entiers positifs, la plupart du temps de petites valeurs (inférieures à 255). On suppose que toutes les valeurs sont représentables sur un `u32`.
+
+## Les matrices de présence
+
+Ce sont des matrices de booléens représentées comme des champs de bits.
+
+Il existe une forme implicite des vecteurs de présence, qui n'est représentée par aucun fichier et pour laquelle toutes les valeurs sont vraies.
+
+## Représentation légère des colonnes
+
+Les colonnes, qu'elles soient unitaires (fichier colonne) ou partie d'un fichier composite matrice, peuvent être représentées par un objet léger donnant accès à leurs valeurs ainsi qu'à la longueur du vecteur. Toutes les méthodes de calcul doivent uniquement travailler à partir de ces représentations légères unifiées des colonnes.
+
+### Représentation légère d'un vecteur de présence
+
+Le vecteur est représenté par :
+
+- un champ de bits encodé comme un `[u64]` ;
+- un `usize` encodant la longueur du champ de bits.
+
+### Représentation légère d'un vecteur de comptage
+
+Le vecteur est représenté par :
+
+- un vecteur `[u8]` encodant directement les valeurs faibles du vecteur `[0,255[`.
+  La valeur 255 est une valeur sentinelle indiquant que la valeur vraie est >= 255
+  et se trouve dans une structure d'overflow ;
+- un itérateur de `(usize, u32)` listant les valeurs d'overflow correspondant aux valeurs
+  sentinelles (255) du `[u8]` ;
+- un `usize` encodant la longueur du vecteur.
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""Parse obikmer merge debug log → Markdown performance report."""
+
+import re
+import sys
+from datetime import datetime
+from collections import defaultdict
+from statistics import mean, median, stdev
+
+ANSI = re.compile(r'\x1b\[[0-9;]*m')
+
+def strip(s):
+    return ANSI.sub('', s)
+
+def parse_ts(s):
+    return datetime.fromisoformat(s.replace('Z', '+00:00'))
+
+def dur_s(s):
+    """Convert a duration string such as ``12ms``, ``3µs`` or ``1.5s`` to seconds.
+
+    Surrounding whitespace is ignored. A string without a recognised unit
+    suffix is interpreted as a plain number of seconds.
+    """
+    text = s.strip()
+    # Two-character suffixes are tested before the bare "s", because "ms",
+    # "µs", "us" and "ns" all end with "s" as well.
+    for suffix, divisor in (('ms', 1e3), ('µs', 1e6), ('us', 1e6), ('ns', 1e9)):
+        if text.endswith(suffix):
+            return float(text[:-2]) / divisor
+    if text.endswith('s'):
+        return float(text[:-1])
+    return float(text)
+
+def fmt_s(s):
+    """Format a duration given in seconds as a short human-readable string.
+
+    The unit (µs, ms, s or min) is chosen from the magnitude of ``s`` and the
+    sign is kept, so a negative duration (for example a residual obtained by
+    subtraction) is shown as ``-2.50s`` rather than as a huge negative number
+    of microseconds. Non-negative inputs are formatted exactly as before.
+    """
+    sign = "-" if s < 0 else ""
+    a = abs(s)
+    if a < 0.001:  return f"{sign}{a*1e6:.0f}µs"
+    if a < 1:      return f"{sign}{a*1e3:.0f}ms"
+    if a < 60:     return f"{sign}{a:.2f}s"
+    return f"{sign}{a/60:.1f}min ({sign}{a:.0f}s)"
+
+def fmt_rate(n, s):
+    """Format the throughput ``n / s`` (items per second) with a G/M/K prefix.
+
+    Returns an em dash when the duration ``s`` is zero or negative, since no
+    rate can be computed in that case.
+    """
+    if s <= 0:
+        return "—"
+    rate = n / s
+    # Largest scale first, so the first threshold reached decides the prefix.
+    for scale, suffix in ((1e9, "G/s"), (1e6, "M/s"), (1e3, "K/s")):
+        if rate >= scale:
+            return f"{rate/scale:.2f}{suffix}"
+    return f"{rate:.0f}/s"
+
+def pct(a, b):
+    """Return ``a`` as a percentage of ``b`` with one decimal, or an em dash if ``b`` is falsy."""
+    if not b:
+        return "—"
+    return f"{100*a/b:.1f}%"
+
+def stats_row(label, values, unit="s", fmt=fmt_s):
+    """Build one Markdown table row: label, then min, median, mean, max and stdev.
+
+    ``fmt`` converts each statistic to text. ``unit`` is accepted but not used
+    by this function. An empty ``values`` list yields a row of em dashes; the
+    standard deviation of a single value is reported as 0.
+    """
+    if not values:
+        return f"| {label} | — | — | — | — | — |"
+    spread = stdev(values) if len(values) > 1 else 0
+    cells = [min(values), median(values), mean(values), max(values), spread]
+    return f"| {label} | " + " | ".join(fmt(c) for c in cells) + " |"
+
+# ── patterns ──────────────────────────────────────────────────────────────────
+
+TS = r'(\d{4}-\d{2}-\d{2}T[\d:.]+Z)'
+
+pats = {
+    'graph_done':    re.compile(TS + r'.*partition (\d+): de Bruijn graph done — (\d+) new kmers'),
+    'trav_start':    re.compile(TS + r'.*partition (\d+): unitig traversal start — (\d+) nodes'),
+    'trav_closing':  re.compile(TS + r'.*partition (\d+): unitig writer closing'),
+    'trav_closed':   re.compile(TS + r'.*partition (\d+): unitig writer closed'),
+    'graph_dropped': re.compile(TS + r'.*partition (\d+): graph dropped — starting MPHF build \((\d+) unitigs\)'),
+    'mphf_done':     re.compile(TS + r'.*partition (\d+): MPHF build done'),
+    'mphf_open':     re.compile(TS + r'.*partition (\d+): MPHF open in ([\d.]+)s'),
+    'bld_ready':     re.compile(TS + r'.*partition (\d+): builders ready in ([\d.]+)s'),
+    'pass2_done':    re.compile(TS + r'.*partition (\d+): pass2 pipeline done in ([\d.]+)s'),
+    'bld_closed':    re.compile(TS + r'.*partition (\d+): builders closed in ([\d.]+)s'),
+    'part_done':     re.compile(TS + r'.*partition (\d+): done in ([\d.]+)s — (\d+) new kmers'),
+    'worker':        re.compile(TS + r'.*activated worker (\d+).*efficiency (\d+)%.*gain vs prev (\d+)%'),
+    'worker_poll':   re.compile(TS + r'.*activated worker (\d+) \(poll\).*efficiency (\d+)%'),
+    'compute_deg':   re.compile(TS + r'.*partition (\d+): compute_degrees in ([\d.]+)s — (\d+) nodes'),
+    'stage_done':    re.compile(TS + r'.*done stage=merge_partitions wall_secs=([\d.]+)'),
+    'workers_rep':   re.compile(r'workers spawned: (\d+) / (\d+)'),
+}
+
+# ── parse ─────────────────────────────────────────────────────────────────────
+
+P = defaultdict(dict)   # partition_id → timing dict
+workers_ev = []
+wall_total = None
+workers_final = (None, None)
+
+# Fail with a readable message instead of an IndexError when no log is given.
+if len(sys.argv) < 2:
+    sys.exit(f"usage: {sys.argv[0]} <merge-debug-log>")
+
+# The patterns above contain non-ASCII characters (e.g. "—"), so the log is
+# decoded as UTF-8 explicitly rather than with the locale's default encoding;
+# undecodable bytes are replaced so that a single bad line cannot abort the run.
+with open(sys.argv[1], encoding='utf-8', errors='replace') as f:
+    for raw in f:
+        # Colour escape sequences are removed before matching.
+        line = strip(raw)
+
+        # Each line is tested against the patterns in a fixed order; the first
+        # match records its values and moves on to the next line.
+        m = pats['graph_done'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['n_kmers'] = int(m.group(3))
+            P[pid]['graph_done_ts'] = parse_ts(m.group(1))
+            continue
+
+        m = pats['trav_start'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['trav_start_ts'] = parse_ts(m.group(1))
+            P[pid]['n_nodes'] = int(m.group(3))
+            continue
+
+        m = pats['trav_closing'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['trav_closing_ts'] = parse_ts(m.group(1))
+            continue
+
+        m = pats['trav_closed'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['trav_closed_ts'] = parse_ts(m.group(1))
+            continue
+
+        m = pats['graph_dropped'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['drop_ts'] = parse_ts(m.group(1))
+            P[pid]['n_unitigs'] = int(m.group(3))
+            continue
+
+        m = pats['mphf_done'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['mphf_done_ts'] = parse_ts(m.group(1))
+            continue
+
+        m = pats['mphf_open'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['mphf_open_s'] = float(m.group(3))
+            continue
+
+        m = pats['bld_ready'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['bld_ready_s'] = float(m.group(3))
+            continue
+
+        m = pats['pass2_done'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['pass2_s'] = float(m.group(3))
+            continue
+
+        m = pats['bld_closed'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['bld_closed_s'] = float(m.group(3))
+            continue
+
+        m = pats['part_done'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['total_s'] = float(m.group(3))
+            P[pid]['done_ts'] = parse_ts(m.group(1))
+            continue
+
+        # The 'worker' pattern requires a "gain vs prev" field; activation
+        # lines without it fall through to the 'worker_poll' pattern below.
+        m = pats['worker'].search(line)
+        if m:
+            workers_ev.append({'n': int(m.group(2)), 'eff': int(m.group(3)),
+                                'gain': int(m.group(4)), 'ts': parse_ts(m.group(1)), 'poll': False})
+            continue
+
+        m = pats['worker_poll'].search(line)
+        if m:
+            workers_ev.append({'n': int(m.group(2)), 'eff': int(m.group(3)),
+                                'gain': None, 'ts': parse_ts(m.group(1)), 'poll': True})
+            continue
+
+        m = pats['compute_deg'].search(line)
+        if m:
+            pid = int(m.group(2))
+            P[pid]['cdeg_s'] = float(m.group(3))
+            # Keep a node count already recorded from the traversal line.
+            P[pid]['n_nodes'] = P[pid].get('n_nodes') or int(m.group(4))
+            continue
+
+        m = pats['stage_done'].search(line)
+        if m:
+            wall_total = float(m.group(2))
+            continue
+
+        m = pats['workers_rep'].search(line)
+        if m:
+            workers_final = (int(m.group(1)), int(m.group(2)))
+            continue
+
+# ── derive per-partition phases ───────────────────────────────────────────────
+
+def tsdiff(p, k1, k2):
+    """Return the seconds elapsed from ``p[k1]`` to ``p[k2]``, or None if either key is absent."""
+    if k1 not in p or k2 not in p:
+        return None
+    elapsed = p[k2] - p[k1]
+    return elapsed.total_seconds()
+
+phases = {}
+for pid, p in P.items():
+    row = {'pid': pid}
+    row['n_kmers']  = p.get('n_kmers', 0)
+    row['n_nodes']  = p.get('n_nodes', 0)
+    row['n_unitigs']= p.get('n_unitigs', 0)
+    row['total_s']  = p.get('total_s')
+    row['cdeg_s']       = p.get('cdeg_s')
+    row['mphf_open_s']  = p.get('mphf_open_s')
+    row['bld_ready_s']  = p.get('bld_ready_s')
+    row['pass2_s']      = p.get('pass2_s')
+    row['bld_closed_s'] = p.get('bld_closed_s')
+
+    # Traversal: trav_start → trav_closing (= writing all unitigs)
+    row['trav_s']   = tsdiff(p, 'trav_start_ts', 'trav_closing_ts')
+    # Writer close: trav_closing → trav_closed
+    row['close_s']  = tsdiff(p, 'trav_closing_ts', 'trav_closed_ts')
+    # Graph drop: trav_closed → drop_ts
+    row['drop_s']   = tsdiff(p, 'trav_closed_ts', 'drop_ts')
+    # MPHF build: drop_ts → mphf_done_ts
+    row['mphf_s']   = tsdiff(p, 'drop_ts', 'mphf_done_ts')
+    # After MPHF: mphf_done → done_ts
+    row['post_s']   = tsdiff(p, 'mphf_done_ts', 'done_ts')
+
+    # Graph build: total - known phases (rough estimate)
+    known = sum(v for v in [row['cdeg_s'], row['trav_s'], row['close_s'], row['drop_s'],
+                             row['mphf_s'], row['mphf_open_s'], row['bld_ready_s'],
+                             row['pass2_s'], row['bld_closed_s']] if v is not None)
+    row['graph_build_s'] = (row['total_s'] - known) if row['total_s'] else None
+
+    phases[pid] = row
+
+# helpers
+def collect(key):
+    return [r[key] for r in phases.values() if r.get(key) is not None]
+
+def rate_stats(n_key, t_key):
+    """Return the per-partition throughputs ``row[n_key] / row[t_key]`` (items/s).
+
+    Partitions whose item count or duration is missing, zero or negative are
+    left out of the result.
+    """
+    rates = []
+    for row in phases.values():
+        items = row.get(n_key)
+        seconds = row.get(t_key)
+        if not items or not seconds or seconds <= 0:
+            continue
+        rates.append(items / seconds)
+    return rates
+
+# ── output ────────────────────────────────────────────────────────────────────
+
+out = []
+w = out.append
+
+w("# obikmer merge — performance report\n")
+
+# Run info
+n_parts = len([r for r in phases.values() if r['n_kmers'] > 0])
+n_empty = len([r for r in phases.values() if r['n_kmers'] == 0])
+total_kmers = sum(r['n_kmers'] for r in phases.values())
+w("## Run summary\n")
+w(f"- **Partitions**: {len(phases)} total — {n_parts} non-empty, {n_empty} empty")
+w(f"- **New kmers (total)**: {total_kmers:,}")
+if wall_total:
+    w(f"- **merge_partitions wall time**: {fmt_s(wall_total)}")
+if workers_final[0]:
+    w(f"- **Workers spawned**: {workers_final[0]} / {workers_final[1]} (max)")
+w("")
+
+# Worker spawn timeline
+if workers_ev:
+    w("## Worker activation\n")
+    w("| Time | Worker # | Trigger | Efficiency | Gain vs prev |")
+    w("|------|----------|---------|------------|--------------|")
+    t0 = workers_ev[0]['ts']
+    for e in workers_ev:
+        elapsed = fmt_s((e['ts'] - t0).total_seconds())
+        trigger = "poll (timeout)" if e['poll'] else "partition done"
+        gain = f"{e['gain']}%" if e.get('gain') is not None else "—"
+        w(f"| +{elapsed} | {e['n']} | {trigger} | {e['eff']}% | {gain} |")
+    w("")
+
+# Phase breakdown table
+w("## Phase timing statistics\n")
+w("Columns: min | median | mean | max | stdev\n")
+w("| Phase | min | median | mean | max | stdev |")
+w("|-------|-----|--------|------|-----|-------|")
+w(stats_row("Graph build (estimated)", collect('graph_build_s')))
+w(stats_row("compute_degrees", collect('cdeg_s')))
+w(stats_row("Unitig traversal", collect('trav_s')))
+w(stats_row("Writer close (uw.close)", collect('close_s')))
+w(stats_row("Graph drop", collect('drop_s')))
+w(stats_row("MPHF build", collect('mphf_s')))
+w(stats_row("MPHF open", collect('mphf_open_s')))
+w(stats_row("Builders ready", collect('bld_ready_s')))
+w(stats_row("Pass2 pipeline", collect('pass2_s')))
+w(stats_row("Builders close", collect('bld_closed_s')))
+w(stats_row("Post-MPHF (residual)", collect('post_s')))
+w(stats_row("**Total per partition**", collect('total_s')))
+w("")
+
+# Throughput
+w("## Throughput by phase\n")
+w("| Phase | metric | min | median | mean | max |")
+w("|-------|--------|-----|--------|------|-----|")
+
+def rate_row(label, rates, metric="nodes/s"):
+    """Build one Markdown row of the throughput table.
+
+    ``rates`` is a list of items-per-second values; the row shows their min,
+    median, mean and max. ``metric`` names the unit shown in the "metric"
+    column; it defaults to ``nodes/s`` so existing calls are unaffected.
+    An empty ``rates`` list yields a row of em dashes.
+    """
+    if not rates: return f"| {label} | — | — | — | — | — |"
+    # The values are already rates, so format them as "x items in 1 second".
+    f = lambda x: fmt_rate(x, 1)
+    mn, med, av, mx = min(rates), median(rates), mean(rates), max(rates)
+    return f"| {label} | {metric} | {f(mn)} | {f(med)} | {f(av)} | {f(mx)} |"
+
+w(rate_row("compute_degrees", rate_stats('n_nodes', 'cdeg_s')))
+w(rate_row("Unitig traversal", rate_stats('n_nodes', 'trav_s')))
+# The MPHF rate is computed from the unitig count, not the node count.
+w(rate_row("MPHF build", rate_stats('n_unitigs', 'mphf_s'), metric="unitigs/s"))
+w("")
+
+
+# Top 10 slowest partitions
+w("## Top 10 slowest partitions\n")
+w("| Partition | nodes | unitigs | total | trav | MPHF | graph build |")
+w("|-----------|-------|---------|-------|------|------|-------------|")
+sorted_parts = sorted(phases.values(), key=lambda r: r['total_s'] or 0, reverse=True)
+for r in sorted_parts[:10]:
+    pid = r['pid']
+    def f(k): return fmt_s(r[k]) if r.get(k) is not None else "—"
+    nodes = f"{r['n_nodes']/1e6:.1f}M" if r['n_nodes'] else "—"
+    unitigs = f"{r['n_unitigs']/1e6:.1f}M" if r['n_unitigs'] else "—"
+    w(f"| {pid} | {nodes} | {unitigs} | {f('total_s')} | {f('trav_s')} | {f('mphf_s')} | {f('graph_build_s')} |")
+w("")
+
+# Phase share of total time (for non-empty partitions with full data)
+complete = [r for r in phases.values()
+            if all(r.get(k) is not None
+                   for k in ('total_s','trav_s','close_s','drop_s','mphf_s',
+                              'mphf_open_s','bld_ready_s','pass2_s','bld_closed_s'))
+            and r['total_s'] and r['total_s'] > 0]
+if complete:
+    w("## Phase share of total time (mean across complete partitions)\n")
+    total_mean = mean(r['total_s'] for r in complete)
+    w(f"_Based on {len(complete)} partitions with full timing data. Mean total: {fmt_s(total_mean)}_\n")
+    w("| Phase | mean time | share |")
+    w("|-------|-----------|-------|")
+    for label, key in [
+        ("Graph build", 'graph_build_s'),
+        ("compute_degrees", 'cdeg_s'),
+        ("Unitig traversal", 'trav_s'),
+        ("Writer close", 'close_s'),
+        ("Graph drop", 'drop_s'),
+        ("MPHF build", 'mphf_s'),
+        ("MPHF open", 'mphf_open_s'),
+        ("Builders ready", 'bld_ready_s'),
+        ("Pass2 pipeline", 'pass2_s'),
+        ("Builders close", 'bld_closed_s'),
+        ("Post-MPHF (residual)", 'post_s'),
+    ]:
+        # 'cdeg_s' and 'post_s' are not part of the completeness check above,
+        # so they may still be None here; mean() would raise TypeError on them.
+        vals = [r[key] for r in complete if r.get(key) is not None]
+        if not vals:
+            w(f"| {label} | — | — |")
+            continue
+        m = mean(vals)
+        w(f"| {label} | {fmt_s(m)} | {pct(m, total_mean)} |")
+    w("")
+
+print('\n'.join(out))
@@ -128,6 +128,12 @@ version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2ad8689a486416c401ea15715a4694de30054248ec627edbf31f49cb64ee4086"

+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
 [[package]]
 name = "as-slice"
 version = "0.2.1"
@@ -143,6 +149,15 @@ version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"

+[[package]]
+name = "autotools"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef941527c41b0fc0dd48511a8154cd5fc7e29200a0ff8b7203c5d777dbc795cf"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "backtrace"
 version = "0.3.76"
@@ -224,6 +239,15 @@ dependencies = [
 "generic-array",
 ]

+[[package]]
+name = "block-buffer"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa"
+dependencies = [
+ "hybrid-array",
+]
+
 [[package]]
 name = "block-pseudorand"
 version = "0.1.2"
@@ -415,6 +439,15 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"

+[[package]]
+name = "cmake"
+version = "0.1.58"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "colorchoice"
 version = "1.0.5"
@@ -464,6 +497,21 @@ dependencies = [
 "windows-sys 0.59.0",
 ]

+[[package]]
+name = "const-oid"
+version = "0.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c"
+
+[[package]]
+name = "convert_case"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "633458d4ef8c78b72454de2d54fd6ab2e60f9e02be22f3c6104cdc8a4e0fceb9"
+dependencies = [
+ "unicode-segmentation",
+]
+
 [[package]]
 name = "core-foundation-sys"
 version = "0.8.7"
@@ -488,6 +536,15 @@ dependencies = [
 "libc",
 ]

+[[package]]
+name = "cpufeatures"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "crc32fast"
 version = "1.5.0"
@@ -601,6 +658,15 @@ dependencies = [
 "typenum",
 ]

+[[package]]
+name = "crypto-common"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453"
+dependencies = [
+ "hybrid-array",
+]
+
 [[package]]
 name = "csv"
 version = "1.4.0"
@@ -640,14 +706,48 @@ dependencies = [
 "uuid",
 ]

+[[package]]
+name = "derive_more"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
+dependencies = [
+ "derive_more-impl",
+]
+
+[[package]]
+name = "derive_more-impl"
+version = "2.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
+dependencies = [
+ "convert_case",
+ "proc-macro2",
+ "quote",
+ "rustc_version",
+ "syn",
+ "unicode-xid",
+]
+
 [[package]]
 name = "digest"
 version = "0.10.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
 dependencies = [
- "block-buffer",
- "crypto-common",
+ "block-buffer 0.10.4",
+ "crypto-common 0.1.7",
+]
+
+[[package]]
+name = "digest"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2"
+dependencies = [
+ "block-buffer 0.12.1",
+ "const-oid",
+ "crypto-common 0.2.2",
 ]

 [[package]]
@@ -742,6 +842,16 @@ version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"

+[[package]]
+name = "filetime"
+version = "0.2.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c287a33c7f0a620c38e641e7f60827713987b3c0f26e8ddc9462cc69cf75759"
+dependencies = [
+ "cfg-if",
+ "libc",
+]
+
 [[package]]
 name = "find-msvc-tools"
 version = "0.1.9"
@@ -916,6 +1026,65 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"

+[[package]]
+name = "http"
+version = "1.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425"
+dependencies = [
+ "bytes",
+ "itoa",
+]
+
+[[package]]
+name = "httparse"
+version = "1.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
+
+[[package]]
+name = "hwlocality"
+version = "1.0.0-alpha.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c2e65a48d3b300843ac84a2fe8e166bb5a5b00f30054593bcee8157e4b465fd"
+dependencies = [
+ "arrayvec",
+ "bitflags 2.11.1",
+ "derive_more",
+ "errno",
+ "hwlocality-sys",
+ "libc",
+ "strum",
+ "thiserror 2.0.18",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "hwlocality-sys"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10a83c43a772c1f774b806deb44891c2a9578eb33cec48aad513482e0da3d4d4"
+dependencies = [
+ "autotools",
+ "cmake",
+ "flate2",
+ "libc",
+ "pkg-config",
+ "sha3",
+ "tar",
+ "ureq 3.3.0",
+ "windows-sys 0.61.2",
+]
+
+[[package]]
+name = "hybrid-array"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da"
+dependencies = [
+ "typenum",
+]
+
 [[package]]
 name = "icu_collections"
 version = "2.2.0"
@@ -1145,6 +1314,16 @@ dependencies = [
 "wasm-bindgen",
 ]

+[[package]]
+name = "keccak"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e24a010dd405bd7ed803e5253182815b41bf2e6a80cc3bfc066658e03a198aa"
+dependencies = [
+ "cfg-if",
+ "cpufeatures 0.3.0",
+]
+
 [[package]]
 name = "kodama"
 version = "0.2.3"
@@ -1486,6 +1665,7 @@ name = "obidebruinj"
 version = "0.1.0"
 dependencies = [
 "ahash",
+ "crossbeam-channel",
 "hashbrown 0.14.5",
 "obifastwrite",
 "obikseq",
@@ -1506,6 +1686,8 @@ dependencies = [
 name = "obikindex"
 version = "0.1.0"
 dependencies = [
+ "crossbeam-channel",
+ "hwlocality",
 "indicatif",
 "ndarray",
 "obicompactvec",
@@ -1522,7 +1704,7 @@ dependencies = [

 [[package]]
 name = "obikmer"
-version = "0.1.0"
+version = "1.1.13"
 dependencies = [
 "clap",
 "csv",
@@ -1540,6 +1722,7 @@ dependencies = [
 "obiskbuilder",
 "obiskio",
 "obisys",
+ "obitaxonomy",
 "pprof",
 "rayon",
 "serde_json",
@@ -1562,6 +1745,7 @@ dependencies = [
 "obikrope",
 "obikseq",
 "obilayeredmap",
+ "obipipeline",
 "obiread",
 "obiskbuilder",
 "obiskio",
@@ -1633,7 +1817,7 @@ dependencies = [
 "regex",
 "tracing",
 "tracing-subscriber",
- "ureq",
+ "ureq 2.12.1",
 ]

 [[package]]
@@ -1670,6 +1854,10 @@ dependencies = [
 "tracing",
 ]

+[[package]]
+name = "obitaxonomy"
+version = "0.1.0"
+
 [[package]]
 name = "object"
 version = "0.37.3"
@@ -2174,6 +2362,15 @@ version = "2.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"

+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
 [[package]]
 name = "rustix"
 version = "1.1.4"
@@ -2260,6 +2457,12 @@ dependencies = [
 "syn",
 ]

+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+
 [[package]]
 name = "serde"
 version = "1.0.228"
@@ -2310,8 +2513,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
 dependencies = [
 "cfg-if",
- "cpufeatures",
- "digest",
+ "cpufeatures 0.2.17",
+ "digest 0.10.7",
+]
+
+[[package]]
+name = "sha3"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be176f1a57ce4e3d31c1a166222d9768de5954f811601fb7ca06fc8203905ce1"
+dependencies = [
+ "digest 0.11.3",
+ "keccak",
 ]

 [[package]]
@@ -2372,6 +2585,27 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"

+[[package]]
+name = "strum"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9628de9b8791db39ceda2b119bbe13134770b56c138ec1d3af810d045c04f9bd"
+dependencies = [
+ "strum_macros",
+]
+
+[[package]]
+name = "strum_macros"
+version = "0.28.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ab85eea0270ee17587ed4156089e10b9e6880ee688791d45a905f5b1ca36f664"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "subtle"
 version = "2.6.1"
@@ -2467,6 +2701,17 @@ version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"

+[[package]]
+name = "tar"
+version = "0.4.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840"
+dependencies = [
+ "filetime",
+ "libc",
+ "xattr",
+]
+
 [[package]]
 name = "tempfile"
 version = "3.27.0"
@@ -2642,12 +2887,24 @@ version = "1.0.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"

+[[package]]
+name = "unicode-segmentation"
+version = "1.13.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8"
+
 [[package]]
 name = "unicode-width"
 version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"

+[[package]]
+name = "unicode-xid"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
+
 [[package]]
 name = "untrusted"
 version = "0.9.0"
@@ -2670,6 +2927,35 @@ dependencies = [
 "webpki-roots 0.26.11",
 ]

+[[package]]
+name = "ureq"
+version = "3.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0"
+dependencies = [
+ "base64",
+ "flate2",
+ "log",
+ "percent-encoding",
+ "rustls",
+ "rustls-pki-types",
+ "ureq-proto",
+ "utf8-zero",
+ "webpki-roots 1.0.7",
+]
+
+[[package]]
+name = "ureq-proto"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c"
+dependencies = [
+ "base64",
+ "http",
+ "httparse",
+ "log",
+]
+
 [[package]]
 name = "url"
 version = "2.5.8"
@@ -2682,6 +2968,12 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "utf8-zero"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e"
+
 [[package]]
 name = "utf8_iter"
 version = "1.0.4"
@@ -3107,6 +3399,16 @@ dependencies = [
 "tap",
 ]

+[[package]]
+name = "xattr"
+version = "1.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156"
+dependencies = [
+ "libc",
+ "rustix",
+]
+
 [[package]]
 name = "xxhash-rust"
 version = "0.8.15"
@@ -1,5 +1,5 @@
 [workspace]
 resolver = "3"
-members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
+members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
 [profile.release]
 debug = 1
@@ -7,6 +7,6 @@ edition = "2024"
 memmap2  = "0.9"
 ndarray  = "0.16"
 rayon    = "1"
+tempfile = "3"

 [dev-dependencies]
-tempfile = "3"
@@ -1,5 +1,5 @@
 use std::fs::{self, File};
-use std::io::{self, Write as _};
+use std::io::{self, BufWriter, Write as _};
 use std::path::{Path, PathBuf};

 use memmap2::Mmap;
@@ -7,8 +7,12 @@ use ndarray::{Array1, Array2};
 use rayon::prelude::*;

 use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
+use crate::colgroup::{ColGroup, MatrixGroupOps};
 use crate::layer_meta::LayerMeta;
 use crate::meta::MatrixMeta;
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
+use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
+use crate::views::BitSliceView;

 fn col_path(dir: &Path, col: usize) -> PathBuf {
    dir.join(format!("col_{col:06}.pbiv"))
@@ -54,34 +58,11 @@ impl ColumnarBitMatrix {
    }

    pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| {
-                let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j));
-                (i, j, inter, union)
-            })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_jaccard_dist(self.col(j)))
    }

    pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j)))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| (i, j, f(i, j)))
-            .collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).hamming_dist(self.col(j)))
    }

    pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> {
@@ -147,113 +128,92 @@ impl PackedBitMatrix {
        }).collect()
    }

-    #[inline]
    fn col_bytes(&self, c: usize) -> &[u8] {
        let start = self.data_offsets[c];
-        let len = (self.n_rows + 7) / 8;
-        &self.mmap[start..start + len]
+        &self.mmap[start..start + self.n_rows.div_ceil(8)]
    }

-    fn count_ones_col(&self, c: usize) -> u64 {
-        let bytes = self.col_bytes(c);
-        let full = self.n_rows / 8;
-        let rem  = self.n_rows % 8;
-        let mut n: u64 = bytes[..full].iter().map(|b| b.count_ones() as u64).sum();
-        if rem > 0 { n += (bytes[full] & ((1u8 << rem) - 1)).count_ones() as u64; }
-        n
+    /// Returns the data of column `c` as `n_rows.div_ceil(64)` `u64` words.
+    ///
+    /// # Panics
+    /// Panics if `c` is out of range or if the mapping is too short to hold
+    /// the requested words starting at `data_offsets[c]`.
+    fn col_words(&self, c: usize) -> &[u64] {
+        let nw = self.n_rows.div_ceil(64);
+        let start = self.data_offsets[c];
+        // Bounds-checked slice: a truncated or corrupt file panics here instead
+        // of letting `from_raw_parts` read past the end of the mapping.
+        let bytes = &self.mmap[start..start + nw * 8];
+        let ptr = bytes.as_ptr() as *const u64;
+        debug_assert_eq!(ptr as usize % 8, 0, "PBMX column data is not u64-aligned");
+        // SAFETY: `bytes` covers exactly `nw * 8` in-bounds bytes (checked above).
+        // data_offsets[c] is always 8-byte aligned:
+        // PBMX header = 24 + n_cols×8 (multiple of 8); each PBIV blob =
+        // 16 + nwords×8 (multiple of 8); mmap base is page-aligned.
+        unsafe { std::slice::from_raw_parts(ptr, nw) }
     }

-    fn pair_op(&self, i: usize, j: usize, and_or: bool) -> u64 {
-        let ai = self.col_bytes(i);
-        let aj = self.col_bytes(j);
-        let full = self.n_rows / 8;
-        let rem  = self.n_rows % 8;
-        let mut n: u64 = ai[..full].iter().zip(aj[..full].iter())
-            .map(|(a, b)| if and_or { a & b } else { a ^ b }.count_ones() as u64)
-            .sum();
-        if rem > 0 {
-            let mask = (1u8 << rem) - 1;
-            let last = if and_or { ai[full] & aj[full] } else { ai[full] ^ aj[full] };
-            n += (last & mask).count_ones() as u64;
-        }
-        n
+    /// Returns a `BitSliceView` built from the `u64` words of column `c`
+    /// (see `col_words`) and the matrix row count `n_rows`.
+    pub(crate) fn col_slice(&self, c: usize) -> BitSliceView<'_> {
+        BitSliceView::new(self.col_words(c), self.n_rows)
     }

-    fn partial_jaccard_col(&self, i: usize, j: usize) -> (u64, u64) {
-        let ai = self.col_bytes(i);
-        let aj = self.col_bytes(j);
-        let full = self.n_rows / 8;
-        let rem  = self.n_rows % 8;
-        let (mut inter, mut union) = ai[..full].iter().zip(aj[..full].iter())
-            .fold((0u64, 0u64), |(inter, union), (a, b)| {
-                (inter + (a & b).count_ones() as u64,
-                 union + (a | b).count_ones() as u64)
-            });
-        if rem > 0 {
-            let mask = (1u8 << rem) - 1;
-            inter += ((ai[full] & aj[full]) & mask).count_ones() as u64;
-            union += ((ai[full] | aj[full]) & mask).count_ones() as u64;
-        }
-        (inter, union)
+    /// Creates a `PersistentBitVecBuilder` at `path` from the raw bytes of
+    /// column `c` (`col_bytes`) and the row count `n_rows`, delegating to
+    /// `PersistentBitVecBuilder::from_raw_bytes`.
+    pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
+        PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path)
     }

    pub(crate) fn count_ones(&self) -> Array1<u64> {
        Array1::from_vec(
-            (0..self.n_cols).into_par_iter().map(|c| self.count_ones_col(c)).collect()
+            (0..self.n_cols).into_par_iter()
+                .map(|c| self.col_slice(c).count_ones())
+                .collect()
        )
    }

    pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| { let (inter, union) = self.partial_jaccard_col(i, j); (i, j, inter, union) })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols, |i, j| {
+            self.col_slice(i).partial_jaccard_dist(self.col_slice(j))
+        })
    }

    pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2<u64> {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| (i, j, self.pair_op(i, j, false)))
-            .collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
+        pairwise_matrix(self.n_cols, |i, j| {
+            self.col_slice(i).hamming_dist(self.col_slice(j))
+        })
    }
 }

 /// Build `presence/matrix.pbmx` from existing `col_*.pbiv` files.
 pub fn pack_bit_matrix(dir: &Path) -> io::Result<()> {
+    let packed_path = dir.join("matrix.pbmx");
+    if packed_path.exists() {
+        // Matrix complete; remove any leftover column files from a killed cleanup.
+        if let Ok(meta) = MatrixMeta::load(dir) {
+            for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
+            let _ = fs::remove_file(dir.join("meta.json"));
+        }
+        return Ok(());
+    }
+
    let meta = MatrixMeta::load(dir)?;
    let n_cols = meta.n_cols;

-    let col_files: Vec<Vec<u8>> = (0..n_cols)
-        .map(|c| fs::read(col_path(dir, c)))
+    // Compute offsets from file sizes — no column data loaded into RAM.
+    let col_sizes: Vec<u64> = (0..n_cols)
+        .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
        .collect::<io::Result<_>>()?;

-    let header_size = PBMX_HEADER + n_cols * 8;
+    let header_size = (PBMX_HEADER + n_cols * 8) as u64;
    let mut col_offset = header_size;
    let mut offsets = Vec::with_capacity(n_cols);
-    for data in &col_files {
-        offsets.push(col_offset as u64);
-        col_offset += data.len();
+    for &size in &col_sizes {
+        offsets.push(col_offset);
+        col_offset += size;
    }

-    let packed_path = dir.join("matrix.pbmx");
-    let mut file = File::create(&packed_path)?;
-    file.write_all(&PBMX_MAGIC)?;
-    file.write_all(&[0u8; 4])?;
-    file.write_all(&(meta.n as u64).to_le_bytes())?;
-    file.write_all(&(n_cols as u64).to_le_bytes())?;
-    for &off in &offsets { file.write_all(&off.to_le_bytes())?; }
-    for data in &col_files { file.write_all(data)?; }
-    drop(file);
+    // Write to a temp file; rename atomically so a killed process never leaves
+    // a truncated matrix.pbmx that would be mistaken for a complete file.
+    let tmp_path = dir.join("matrix.pbmx.tmp");
+    let mut out = BufWriter::new(File::create(&tmp_path)?);
+    out.write_all(&PBMX_MAGIC)?;
+    out.write_all(&[0u8; 4])?;
+    out.write_all(&(meta.n as u64).to_le_bytes())?;
+    out.write_all(&(n_cols as u64).to_le_bytes())?;
+    for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
+    for c in 0..n_cols {
+        io::copy(&mut File::open(col_path(dir, c))?, &mut out)?;
+    }
+    out.flush()?;
+    drop(out);
+    fs::rename(&tmp_path, &packed_path)?;

    for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
    fs::remove_file(dir.join("meta.json"))?;
@@ -326,6 +286,24 @@ impl PersistentBitMatrix {
        }
    }

+    /// Returns a `BitSliceView` of column `c`.
+    ///
+    /// The `Columnar` variant views the stored column; the `Packed` variant
+    /// views the column inside the packed matrix.
+    ///
+    /// # Panics
+    /// Panics when called on the `Implicit` variant, which has no column view.
+    pub fn col_view(&self, c: usize) -> BitSliceView<'_> {
+        match self {
+            Self::Columnar(m) => m.col(c).view(),
+            Self::Packed(m)   => m.col_slice(c),
+            Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"),
+        }
+    }
+
+    /// Creates a `PersistentBitVecBuilder` at `path` initialised from column `c`.
+    ///
+    /// - `Columnar`: built from the stored column via `build_from`.
+    /// - `Packed`: delegated to the packed matrix's `col_persist`.
+    /// - `Implicit`: built with `new_ones` over `n_rows` (presumably an
+    ///   all-ones column; confirm against `PersistentBitVecBuilder::new_ones`).
+    pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentBitVecBuilder> {
+        match self {
+            Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path),
+            Self::Packed(m)   => m.col_persist(c, path),
+            Self::Implicit { n_rows, .. } => {
+                PersistentBitVecBuilder::new_ones(*n_rows, path)
+            }
+        }
+    }
+
    pub fn row(&self, slot: usize) -> Box<[bool]> {
        match self {
            Self::Columnar(m)             => m.row(slot),
@@ -422,12 +400,93 @@ impl PersistentBitMatrixBuilder {
        PersistentBitVecBuilder::new(self.n, &path)
    }

+    /// Reserves the next column slot and returns a builder for it created with
+    /// `PersistentBitVecBuilder::new_ones` (presumably all bits set; confirm
+    /// against that constructor).
+    ///
+    /// `n_cols` is incremented before the builder is created, so the slot is
+    /// counted even if creation fails.
+    pub fn add_col_ones(&mut self) -> io::Result<PersistentBitVecBuilder> {
+        let path = col_path(&self.dir, self.n_cols);
+        self.n_cols += 1;
+        PersistentBitVecBuilder::new_ones(self.n, &path)
+    }
+
+    /// Appends a column by persisting `src` at the next column path.
+    ///
+    /// `n_cols` is incremented only after `make_persistent` succeeds, so a
+    /// failed write does not count as a column.
+    pub fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> {
+        src.make_persistent(&col_path(&self.dir, self.n_cols))?;
+        self.n_cols += 1;
+        Ok(())
+    }
+
+    /// Appends a bit column derived from an integer vector: the builder is
+    /// OR-ed with `src` under the predicate `v > 0`, then closed.
+    ///
+    /// # Errors
+    /// Returns any I/O error from creating or closing the column file. On
+    /// error `n_cols` is left unchanged, consistent with `add_col_from`.
+    pub fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
+        let path = col_path(&self.dir, self.n_cols);
+        let mut b = PersistentBitVecBuilder::new(self.n, &path)?;
+        b.or_where(src.view(), |v| v > 0);
+        b.close()?;
+        // Count the column only once its file is fully written, so a failure
+        // above never leaves the metadata pointing at a missing column.
+        self.n_cols += 1;
+        Ok(())
+    }
+
    pub fn close(self) -> io::Result<()> {
        MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
    }
 }

-// ── Helpers ───────────────────────────────────────────────────────────────────
+// ── MatrixGroupOps ────────────────────────────────────────────────────────────
+
+impl MatrixGroupOps for PersistentBitMatrix {
+    /// Per-row count of the group's columns, accumulated with
+    /// `inc_present_fast`. The threshold argument is ignored: a bit matrix
+    /// only stores 0/1.
+    ///
+    /// Groups of 255 or more columns are processed in batches of at most 254
+    /// columns; each batch is counted in its own builder, frozen, and added
+    /// into the running total.
+    fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempCompactIntVec> {
+        let n_rows = self.n();
+        if g.indices.len() >= 255 {
+            let mut total = TempCompactIntVecBuilder::new(n_rows)?;
+            for batch in g.indices.chunks(254) {
+                let mut partial = TempCompactIntVecBuilder::new(n_rows)?;
+                for &col in batch {
+                    partial.inc_present_fast(self.col_view(col));
+                }
+                let partial = partial.freeze()?;
+                total.add(partial.view());
+            }
+            return total.freeze();
+        }
+
+        let mut counts = TempCompactIntVecBuilder::new(n_rows)?;
+        for &col in &g.indices {
+            counts.inc_present_fast(self.col_view(col));
+        }
+        counts.freeze()
+    }
+
+    /// Sum over the group's columns. With 0/1 entries this is the same as the
+    /// presence count, so it delegates to it.
+    fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        self.partial_group_presence_count(g, 1)
+    }
+
+    /// OR of all columns in the group. The threshold argument is ignored.
+    fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result<TempBitVec> {
+        let mut acc = TempBitVecBuilder::new(self.n())?;
+        g.indices.iter().for_each(|&col| {
+            acc.or(self.col_view(col));
+        });
+        acc.freeze()
+    }
+
+    /// Minimum over the group's columns: start from the first column, then
+    /// mask with every remaining one (AND over 0/1 values). An empty group
+    /// leaves the freshly created builder untouched.
+    fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        let mut acc = TempCompactIntVecBuilder::new(self.n())?;
+        let mut cols = g.indices.iter().copied();
+        if let Some(first) = cols.next() {
+            acc.inc_present_fast(self.col_view(first));
+            for col in cols {
+                acc.mask_with(self.col_view(col));
+            }
+        }
+        acc.freeze()
+    }
+
+    /// Maximum over the group's columns: the OR of the group (see
+    /// `partial_group_any`) converted into an integer vector.
+    fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        let presence = self.partial_group_any(g, 1)?;
+        let mut as_counts = TempCompactIntVecBuilder::new(presence.len())?;
+        as_counts.inc_present(presence.view());
+        as_counts.freeze()
+    }
+}
+
+// ── Shared matrix helpers (also used by intmatrix.rs) ─────────────────────────

 /// Lists every index pair `(i, j)` with `i < j < n`, ordered by `i`, then `j`.
 fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
     let mut pairs = Vec::new();
     for i in 0..n {
         for j in (i + 1)..n {
             pairs.push((i, j));
         }
     }
     pairs
 }
    for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
    m
 }
+
+/// Builds a symmetric `n×n` matrix: `f(i, j)` is evaluated in parallel for
+/// every upper-triangle pair and the value is mirrored into `(j, i)`.
+/// Because `T: Copy`, the mirrored value needs no `.clone()`.
+pub(crate) fn pairwise_matrix<T>(n: usize, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
+where T: Copy + Default + Send {
+    let cells: Vec<(usize, usize, T, T)> = upper_pairs(n)
+        .into_par_iter()
+        .map(|(i, j)| {
+            let v = f(i, j);
+            (i, j, v, v)
+        })
+        .collect();
+    fill_symmetric(n, cells.into_iter())
+}
+
+/// Two-output variant of `pairwise_matrix`: `f(i, j)` returns a pair of
+/// values, each written symmetrically into its own `n×n` matrix (e.g.
+/// intersection and union counts for Jaccard).
+pub(crate) fn pairwise2_matrix<T>(n: usize, f: impl Fn(usize, usize) -> (T, T) + Sync) -> (Array2<T>, Array2<T>)
+where T: Copy + Default + Send {
+    let pairs = upper_pairs(n);
+    // Indexed parallel collect keeps `values` in the same order as `pairs`.
+    let values: Vec<(T, T)> = pairs.par_iter().map(|&(i, j)| f(i, j)).collect();
+
+    let mut first = Array2::from_elem((n, n), T::default());
+    let mut second = Array2::from_elem((n, n), T::default());
+    for (&(i, j), &(a, b)) in pairs.iter().zip(values.iter()) {
+        first[[i, j]] = a;
+        first[[j, i]] = a;
+        second[[i, j]] = b;
+        second[[j, i]] = b;
+    }
+    (first, second)
+}
@@ -5,29 +5,25 @@ use std::path::{Path, PathBuf};
 use memmap2::{Mmap, MmapMut};

 use crate::reader::PersistentCompactIntVec;
+use crate::views::{BitSliceIter, BitSliceView, IntSliceView};

 const MAGIC: [u8; 4] = *b"PBIV";

 // Header: magic(4) + _pad(4) + n(8) = 16 bytes.
-// Data starts at offset 16, which is divisible by 8 → u64-aligned
-// (mmap base is page-aligned, 16 % 8 == 0).
+// Data starts at offset 16, u64-aligned (mmap base is page-aligned, 16 % 8 == 0).
 const HEADER_SIZE: usize = 16;

 #[inline]
-fn n_words(n: usize) -> usize {
-    n.div_ceil(64)
-}
+pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) }

 #[inline]
-fn n_bytes_for_words(n: usize) -> usize {
-    n_words(n) * 8
-}
+fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 }

-// ── Reader ────────────────────────────────────────────────────────────────────
+// ── PersistentBitVec ──────────────────────────────────────────────────────────

 pub struct PersistentBitVec {
    mmap: Mmap,
-    n: usize,
+    n:    usize,
    path: PathBuf,
 }

@@ -35,157 +31,145 @@ impl PersistentBitVec {
    pub fn open(path: &Path) -> io::Result<Self> {
        let mmap = unsafe { Mmap::map(&File::open(path)?)? };
        if mmap.len() < HEADER_SIZE {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidData,
-                "PBIV file too short",
-            ));
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short"));
        }
        if &mmap[0..4] != &MAGIC {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic"));
        }
        let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
-        Ok(Self {
-            mmap,
-            n,
-            path: path.to_path_buf(),
-        })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
    }

-    pub fn path(&self) -> &Path {
-        &self.path
-    }
-    pub fn len(&self) -> usize {
-        self.n
-    }
-    pub fn is_empty(&self) -> bool {
-        self.n == 0
-    }
+    pub fn path(&self) -> &Path { &self.path }
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }

    pub fn get(&self, slot: usize) -> bool {
        (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
    }

-    // Used by iter() and get(): exact byte window, no padding.
-    fn data_bytes(&self) -> &[u8] {
-        &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n.div_ceil(8)]
-    }
-
-    // Bulk word view. SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8,
-    // so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes.
+    // SAFETY: mmap is page-aligned, HEADER_SIZE=16 divisible by 8 → u64-aligned.
    fn data_words(&self) -> &[u64] {
-        let nw = n_words(self.n);
+        let nw  = n_words(self.n);
        let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64;
        unsafe { std::slice::from_raw_parts(ptr, nw) }
    }

-    pub fn count_ones(&self) -> u64 {
-        // Padding bits in the last word are 0, so no masking needed.
-        self.data_words()
-            .iter()
-            .map(|w| w.count_ones() as u64)
-            .sum()
+    pub fn view(&self) -> BitSliceView<'_> {
+        BitSliceView::new(self.data_words(), self.n)
    }

-    pub fn count_zeros(&self) -> u64 {
-        self.n as u64 - self.count_ones()
-    }
+    pub fn words(&self) -> &[u64] { self.data_words() }

-    pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
-        let (inter, union) = self.partial_jaccard_dist(other);
-        if union == 0 {
-            return 0.0;
-        }
-        1.0 - inter as f64 / union as f64
-    }
+    pub fn count_ones(&self)  -> u64 { self.view().count_ones() }
+    pub fn count_zeros(&self) -> u64 { self.view().count_zeros() }

    pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        self.data_words()
-            .iter()
-            .zip(other.data_words())
-            .fold((0u64, 0u64), |(i, u), (&a, &b)| {
-                (
-                    i + (a & b).count_ones() as u64,
-                    u + (a | b).count_ones() as u64,
-                )
-            })
+        self.view().partial_jaccard_dist(other.view())
+    }
+    pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 {
+        self.view().jaccard_dist(other.view())
    }
-
    pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 {
-        assert_eq!(self.n, other.n, "length mismatch");
-        self.data_words()
-            .iter()
-            .zip(other.data_words())
-            .map(|(&a, &b)| (a ^ b).count_ones() as u64)
-            .sum()
+        self.view().hamming_dist(other.view())
    }

    pub fn iter(&self) -> BitIter<'_> {
-        BitIter {
-            bytes: self.data_bytes(),
-            slot: 0,
-            n: self.n,
-        }
+        BitIter { words: self.data_words(), slot: 0, n: self.n }
    }
 }

 impl<'a> IntoIterator for &'a PersistentBitVec {
    type Item = bool;
    type IntoIter = BitIter<'a>;
-    fn into_iter(self) -> BitIter<'a> {
-        self.iter()
-    }
+    fn into_iter(self) -> BitIter<'a> { self.iter() }
 }

+// ── BitIter ───────────────────────────────────────────────────────────────────
+
 pub struct BitIter<'a> {
-    bytes: &'a [u8],
-    slot: usize,
-    n: usize,
+    words: &'a [u64],
+    slot:  usize,
+    n:     usize,
 }

 impl ExactSizeIterator for BitIter<'_> {}

 impl Iterator for BitIter<'_> {
    type Item = bool;
-
    fn next(&mut self) -> Option<bool> {
-        if self.slot >= self.n {
-            return None;
-        }
-        let v = (self.bytes[self.slot >> 3] >> (self.slot & 7)) & 1 != 0;
+        if self.slot >= self.n { return None; }
+        let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0;
        self.slot += 1;
        Some(v)
    }
-
    fn size_hint(&self) -> (usize, Option<usize>) {
        let rem = self.n - self.slot;
        (rem, Some(rem))
    }
 }

-// ── Builder ───────────────────────────────────────────────────────────────────
+// ── PersistentBitVecBuilder ───────────────────────────────────────────────────

 pub struct PersistentBitVecBuilder {
    mmap: MmapMut,
-    n: usize,
+    n:    usize,
+    path: PathBuf,
 }

 impl PersistentBitVecBuilder {
    pub fn new(n: usize, path: &Path) -> io::Result<Self> {
        let file_size = HEADER_SIZE + n_bytes_for_words(n);
        let mut file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(true)
+            .read(true).write(true).create(true).truncate(true)
            .open(path)?;
        file.write_all(&MAGIC)?;
-        file.write_all(&[0u8; 4])?; // padding
+        file.write_all(&[0u8; 4])?;
        file.write_all(&(n as u64).to_le_bytes())?;
        file.seek(SeekFrom::Start(0))?;
        file.set_len(file_size as u64)?;
        let mmap = unsafe { MmapMut::map_mut(&file)? };
-        Ok(Self { mmap, n })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
+    }
+
+    /// Create a bit vector of length `n` at `path`, initialised from `bytes`.
+    ///
+    /// `bytes` is copied to the start of the data section; bit `i` lives in
+    /// byte `i / 8` at position `i % 8`, the same addressing `get` uses. Any
+    /// data bytes not covered by `bytes` stay zero.
+    ///
+    /// Bits at positions `>= n` are cleared after the copy so the padding of
+    /// the last word is always zero, as the word-level counting code expects.
+    ///
+    /// # Errors
+    /// Returns `InvalidInput` (before touching `path`) when `bytes` is longer
+    /// than the data section needed for `n` bits; otherwise propagates I/O
+    /// errors from creating, sizing or mapping the file.
+    pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
+        let data_len = n_bytes_for_words(n);
+        if bytes.len() > data_len {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "raw bytes longer than bit vector data section",
+            ));
+        }
+        let file_size = HEADER_SIZE + data_len;
+        let file = OpenOptions::new()
+            .read(true).write(true).create(true).truncate(true)
+            .open(path)?;
+        file.set_len(file_size as u64)?;
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        mmap[0..4].copy_from_slice(&MAGIC);
+        mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes());
+        let data = &mut mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()];
+        data.copy_from_slice(bytes);
+        // Clear every copied bit at position >= n. When n % 8 == 0 the mask
+        // is 0, which wipes the whole byte — all of its bits are padding.
+        let first_pad_byte = n / 8;
+        if first_pad_byte < data.len() {
+            data[first_pad_byte] &= (1u8 << (n % 8)) - 1;
+            data[first_pad_byte + 1..].fill(0);
+        }
+        Ok(Self { mmap, n, path: path.to_path_buf() })
+    }
+
+    /// Create an all-ones bit vector of length `n` at `path`.
+    ///
+    /// More efficient than `new(n, path)` + `not()`: the data is written as
+    /// 0xFF bytes in a single sequential pass, with no intermediate all-zeros state.
+    /// The 0xFF bytes are written from a fixed-size buffer, so memory use does
+    /// not grow with `n`. Padding bits of the last word are cleared afterwards.
+    ///
+    /// # Errors
+    /// Propagates I/O errors from creating, writing, sizing or mapping the file.
+    pub fn new_ones(n: usize, path: &Path) -> io::Result<Self> {
+        // Upper bound on the scratch buffer used to emit the 0xFF bytes.
+        const CHUNK: usize = 64 * 1024;
+        let nw        = n_words(n);
+        let data_len  = nw * 8;
+        let file_size = HEADER_SIZE + data_len;
+        let mut file  = OpenOptions::new()
+            .read(true).write(true).create(true).truncate(true)
+            .open(path)?;
+        file.write_all(&MAGIC)?;
+        file.write_all(&[0u8; 4])?;
+        file.write_all(&(n as u64).to_le_bytes())?;
+        let ones = vec![0xFFu8; data_len.min(CHUNK)];
+        let mut remaining = data_len;
+        while remaining > 0 {
+            let take = remaining.min(ones.len());
+            file.write_all(&ones[..take])?;
+            remaining -= take;
+        }
+        file.seek(SeekFrom::Start(0))?;
+        file.set_len(file_size as u64)?;
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        // Clear padding bits in the last word so trailing bits are always 0.
+        let rem = n % 64;
+        if rem != 0 {
+            let ptr   = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
+            let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
+            words[nw - 1] &= (1u64 << rem) - 1;
+        }
+        Ok(Self { mmap, n, path: path.to_path_buf() })
     }

    pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result<Self> {
@@ -193,86 +177,14 @@ impl PersistentBitVecBuilder {
        let file = OpenOptions::new().read(true).write(true).open(path)?;
        let mmap = unsafe { MmapMut::map_mut(&file)? };
        let n = source.len();
-        Ok(Self { mmap, n })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
    }

-    pub fn len(&self) -> usize {
-        self.n
-    }
-    pub fn is_empty(&self) -> bool {
-        self.n == 0
-    }
-
-    pub fn get(&self, slot: usize) -> bool {
-        (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0
-    }
-
-    pub fn set(&mut self, slot: usize, value: bool) {
-        let byte = HEADER_SIZE + (slot >> 3);
-        let bit = 1u8 << (slot & 7);
-        if value {
-            self.mmap[byte] |= bit;
-        } else {
-            self.mmap[byte] &= !bit;
-        }
-    }
-
-    // SAFETY: same alignment argument as PersistentBitVec::data_words.
-    fn data_words_mut(&mut self) -> &mut [u64] {
-        let nw = n_words(self.n);
-        let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
-        unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
-    }
-
-    pub fn and(&mut self, other: &PersistentBitVec) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
-            *sw &= ow;
-        }
-    }
-
-    pub fn or(&mut self, other: &PersistentBitVec) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
-            *sw |= ow;
-        }
-    }
-
-    pub fn xor(&mut self, other: &PersistentBitVec) {
-        assert_eq!(self.n, other.n, "length mismatch");
-        for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) {
-            *sw ^= ow;
-        }
-    }
-
-    pub fn not(&mut self) {
-        let rem = self.n % 64;
-        let words = self.data_words_mut();
-        for w in words.iter_mut() {
-            *w ^= u64::MAX;
-        }
-        // Zero padding bits in the last word so count_ones / jaccard remain correct.
-        if rem != 0 {
-            if let Some(last) = words.last_mut() {
-                *last &= (1u64 << rem) - 1;
-            }
-        }
-    }
-
-    /// Convert a count vector to a bit vector: bit set iff count >= threshold.
-    /// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead.
-    pub fn build_from_counts(
-        source: &PersistentCompactIntVec,
-        threshold: u32,
-        path: &Path,
-    ) -> io::Result<Self> {
+    pub fn build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result<Self> {
        let n = source.len();
        let file_size = HEADER_SIZE + n_bytes_for_words(n);
        let mut file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(true)
+            .read(true).write(true).create(true).truncate(true)
            .open(path)?;
        file.write_all(&MAGIC)?;
        file.write_all(&[0u8; 4])?;
@@ -280,27 +192,157 @@ impl PersistentBitVecBuilder {
        file.seek(SeekFrom::Start(0))?;
        file.set_len(file_size as u64)?;
        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
-
        {
-            let nw = n_words(n);
+            let nw  = n_words(n);
            let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
            let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) };
            for (slot, count) in source.iter().enumerate() {
-                if count >= threshold {
-                    words[slot >> 6] |= 1u64 << (slot & 63);
-                }
+                if count >= threshold { words[slot >> 6] |= 1u64 << (slot & 63); }
            }
        }
-
-        Ok(Self { mmap, n })
+        Ok(Self { mmap, n, path: path.to_path_buf() })
    }

-    /// Convert a count vector to a presence/absence bit vector (threshold = 1).
    pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
        Self::build_from_counts(source, 1, path)
    }

-    pub fn close(self) -> io::Result<()> {
-        self.mmap.flush()
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }
+
+    /// Returns the bit stored at `slot`, read from byte `slot / 8` of the
+    /// data section at bit position `slot % 8`.
+    pub fn get(&self, slot: usize) -> bool {
+        let byte = self.mmap[HEADER_SIZE + slot / 8];
+        let mask = 1u8 << (slot % 8);
+        (byte & mask) != 0
+    }
+
+    /// Sets (`value == true`) or clears (`value == false`) the bit at `slot`.
+    ///
+    /// # Panics
+    /// Panics if `slot >= len()`. Without this check a slot inside the padding
+    /// of the last word would be accepted and leave non-zero padding bits,
+    /// which the word-level counting code assumes are always 0.
+    pub fn set(&mut self, slot: usize, value: bool) {
+        assert!(slot < self.n, "bit slot out of range");
+        let bit  = 1u64 << (slot & 63);
+        let word = &mut self.data_words_mut()[slot >> 6];
+        if value { *word |= bit; } else { *word &= !bit; }
+    }
+
+    /// Read-only view of the data section as `n_words(self.n)` u64 words.
+    fn data_words(&self) -> &[u64] {
+        let data = &self.mmap[HEADER_SIZE..];
+        // SAFETY: same alignment argument as PersistentBitVec::data_words.
+        unsafe { std::slice::from_raw_parts(data.as_ptr().cast::<u64>(), n_words(self.n)) }
+    }
+
+    // SAFETY: same alignment argument as PersistentBitVec::data_words.
+    fn data_words_mut(&mut self) -> &mut [u64] {
+        let nw  = n_words(self.n);
+        let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64;
+        unsafe { std::slice::from_raw_parts_mut(ptr, nw) }
+    }
+
+    pub fn view(&self) -> BitSliceView<'_> {
+        BitSliceView::new(self.data_words(), self.n)
+    }
+
+    pub fn words(&self) -> &[u64] { self.data_words() }
+
+    pub fn copy_from(&mut self, src: BitSliceView<'_>) {
+        assert_eq!(self.n, src.len(), "BitSliceView length mismatch");
+        self.data_words_mut().copy_from_slice(src.words());
+    }
+
+    pub fn and(&mut self, other: BitSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
+        for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w &= o; }
+    }
+
+    pub fn or(&mut self, other: BitSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
+        for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w |= o; }
+    }
+
+    pub fn xor(&mut self, other: BitSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "BitSliceView length mismatch");
+        for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w ^= o; }
+    }
+
+    /// Inverts every bit in place, then zeroes the padding bits of the last
+    /// word so that bits at positions `>= n` stay 0.
+    pub fn not(&mut self) {
+        let tail_bits = self.n % 64;
+        let words     = self.data_words_mut();
+        words.iter_mut().for_each(|w| *w = !*w);
+        if tail_bits == 0 { return; }
+        // Keep only the low `tail_bits` bits of the final word.
+        if let Some(last) = words.last_mut() {
+            *last &= u64::MAX >> (64 - tail_bits);
+        }
+    }
+
+    /// Sets the bit of every slot for which `pred(col[slot])` holds.
+    ///
+    /// Primary bytes are scanned one 64-slot word at a time (bytes equal to
+    /// the 255 sentinel are skipped); overflow entries are handled afterwards.
+    /// Panics if `col.len()` differs from the length of this bit vector.
+    pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let len   = self.n;
+        let bytes = col.primary_bytes();
+        let words = self.data_words_mut();
+        for (wi, word) in words.iter_mut().enumerate() {
+            let start = wi * 64;
+            let end   = (start + 64).min(len);
+            let hits  = bytes[start..end]
+                .iter()
+                .enumerate()
+                .filter(|&(_, &b)| b != 255 && pred(b as u32))
+                .fold(0u64, |acc, (bit, _)| acc | (1u64 << bit));
+            *word |= hits;
+        }
+        for (slot, value) in col.overflow_entries() {
+            if pred(value) {
+                words[slot / 64] |= 1u64 << (slot % 64);
+            }
+        }
+    }
+
+    /// Clears the bit of every slot for which `pred(col[slot])` is false.
+    ///
+    /// Primary bytes are scanned one 64-slot word at a time (bytes equal to
+    /// the 255 sentinel are skipped); overflow entries are handled afterwards.
+    /// Panics if `col.len()` differs from the length of this bit vector.
+    pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let len   = self.n;
+        let bytes = col.primary_bytes();
+        let words = self.data_words_mut();
+        for (wi, word) in words.iter_mut().enumerate() {
+            let start = wi * 64;
+            let end   = (start + 64).min(len);
+            let mut to_clear = 0u64;
+            for (bit, &b) in bytes[start..end].iter().enumerate() {
+                if b != 255 && !pred(b as u32) {
+                    to_clear |= 1u64 << bit;
+                }
+            }
+            *word &= !to_clear;
+        }
+        for (slot, value) in col.overflow_entries() {
+            if !pred(value) {
+                words[slot / 64] &= !(1u64 << (slot % 64));
+            }
+        }
+    }
+
+    /// Flips the bit of every slot for which `pred(col[slot])` holds.
+    ///
+    /// Primary bytes are scanned one 64-slot word at a time (bytes equal to
+    /// the 255 sentinel are skipped); overflow entries are handled afterwards.
+    /// Panics if `col.len()` differs from the length of this bit vector.
+    pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let len   = self.n;
+        let bytes = col.primary_bytes();
+        let words = self.data_words_mut();
+        for (wi, word) in words.iter_mut().enumerate() {
+            let start = wi * 64;
+            let end   = (start + 64).min(len);
+            let mut flips = 0u64;
+            for (bit, &b) in bytes[start..end].iter().enumerate() {
+                if b != 255 && pred(b as u32) {
+                    flips |= 1u64 << bit;
+                }
+            }
+            *word ^= flips;
+        }
+        for (slot, value) in col.overflow_entries() {
+            if pred(value) {
+                words[slot / 64] ^= 1u64 << (slot % 64);
+            }
+        }
+    }
+
+    pub fn iter(&self) -> BitSliceIter<'_> {
+        self.view().iter()
+    }
+
+    pub fn close(self) -> io::Result<()> { self.mmap.flush() }
+
+    pub fn finish(self) -> io::Result<PersistentBitVec> {
+        let path = self.path.clone();
+        self.close()?;
+        PersistentBitVec::open(&path)
    }
 }
@@ -5,71 +5,57 @@ use std::path::{Path, PathBuf};

 use memmap2::MmapMut;

-use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE, finalize_pciv};
+use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
 use crate::reader::PersistentCompactIntVec;
+use crate::views::{BitSliceView, IntSliceView};

 pub struct PersistentCompactIntVecBuilder {
-    path: PathBuf,
-    mmap: MmapMut,
-    n: usize,
+    path:     PathBuf,
+    mmap:     MmapMut,
+    n:        usize,
    overflow: HashMap<usize, u32>,
 }

 impl PersistentCompactIntVecBuilder {
-    /// Create a new, zero-filled PCIV at `path`. Primary is mmapped immediately.
    pub fn new(n: usize, path: &Path) -> io::Result<Self> {
        let file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(true)
+            .read(true).write(true).create(true).truncate(true)
            .open(path)?;
        file.set_len((HEADER_SIZE + n) as u64)?;
        let mmap = unsafe { MmapMut::map_mut(&file)? };
-        Ok(Self {
-            path: path.to_path_buf(),
-            mmap,
-            n,
-            overflow: HashMap::new(),
-        })
+        Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
+    }
+
+    /// Creates a builder at `path` whose primary section is a copy of
+    /// `primary` (one byte per slot, so `n == primary.len()`) and whose
+    /// in-memory overflow map is `overflow`, taken as given.
+    pub fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
+        let n = primary.len();
+        let mut options = OpenOptions::new();
+        options.read(true).write(true).create(true).truncate(true);
+        let file = options.open(path)?;
+        file.set_len((HEADER_SIZE + n) as u64)?;
+        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
+        let primary_section = &mut mmap[HEADER_SIZE..HEADER_SIZE + n];
+        primary_section.copy_from_slice(primary);
+        Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
     }

-    /// Copy `source`'s file to `path`, mmap the primary section, load overflow into RAM.
-    /// Avoids iterating all n slots: the file copy is OS-level, overflow loading is O(n_overflow).
    pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
        fs::copy(source.path(), path)?;
-
        let file = OpenOptions::new().read(true).write(true).open(path)?;
        let mmap = unsafe { MmapMut::map_mut(&file)? };
-
-        let n = source.len();
+        let n          = source.len();
        let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
        let data_offset = HEADER_SIZE + n;
-
        let mut overflow = HashMap::with_capacity(n_overflow);
        for i in 0..n_overflow {
-            let off = data_offset + i * OVERFLOW_ENTRY_SIZE;
-            let slot  = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
-            let value = u32::from_le_bytes(mmap[off + 8..off + 12].try_into().unwrap());
+            let (slot, value) = parse_overflow_entry(&mmap, data_offset, i);
            overflow.insert(slot, value);
        }
-
-        Ok(Self {
-            path: path.to_path_buf(),
-            mmap,
-            n,
-            overflow,
-        })
+        Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
    }

-    /// Get the value at the given slot, handling overflow if necessary.
    pub fn get(&self, slot: usize) -> u32 {
        match self.mmap[HEADER_SIZE + slot] {
-            255 => *self
-                .overflow
-                .get(&slot)
-                .expect("sentinel without overflow entry"),
-            v => v as u32,
+            255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"),
+            v   => v as u32,
        }
    }

@@ -83,61 +69,201 @@ impl PersistentCompactIntVecBuilder {
        }
    }

-    pub fn len(&self) -> usize {
-        self.n
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }
+
+    pub fn primary_bytes(&self)     -> &[u8]      { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
+    pub fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] }
+    pub fn clear_overflow(&mut self) { self.overflow.clear(); }
+
+    pub fn sum(&self) -> u64 {
+        byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied())
+    }
+    pub fn count_nonzero(&self) -> u64 {
+        byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n])
    }

-    pub fn is_empty(&self) -> bool {
-        self.n == 0
+    pub fn view(&self) -> IntSliceView<'_> {
+        // Primary-only view: the builder keeps overflow values in a HashMap, not
+        // in a raw overflow layout, so an empty overflow slice and a count of 0
+        // are passed (presumably meaning "no overflow entries" — confirm against
+        // `IntSliceView::new`). Slots whose primary byte is the 255 sentinel
+        // therefore have no overflow entry in this view; read those through
+        // `get(slot)` or `overflow_entries()` on the builder. Nothing is allocated.
+        let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n];
+        IntSliceView::new(primary, &[], 0, self.n)
     }

-    pub fn min(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            if other_val < self.get(slot) {
-                self.set(slot, other_val);
+    pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + '_ {
+        self.overflow.iter().map(|(&k, &v)| (k, v))
+    }
+
+    pub fn inc(&mut self, slot: usize) {
+        let v = self.get(slot);
+        self.set(slot, v.saturating_add(1));
+    }
+
+    // ── Computation methods ───────────────────────────────────────────────────
+
+    /// Increment one counter per 1-bit of `col`.  Safe for any group size.
+    pub fn inc_present(&mut self, col: BitSliceView<'_>) {
+        let n = self.n;
+        for (wi, &word) in col.words().iter().enumerate() {
+            if word == 0 { continue; }
+            let mut w = word;
+            while w != 0 {
+                let bit  = w.trailing_zeros() as usize;
+                let slot = wi * 64 + bit;
+                if slot < n { self.inc(slot); }
+                w &= w - 1;
            }
        }
    }

-    pub fn max(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            if other_val > self.get(slot) {
-                self.set(slot, other_val);
+    /// Increment one counter per 1-bit of `col`, using raw u8 arithmetic.
+    /// Caller guarantees no counter will reach 255 (group size < 255).
+    pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
+        {
+            let primary = self.primary_bytes_mut();
+            let n       = primary.len();
+            for (wi, &word) in col.words().iter().enumerate() {
+                if word == 0 { continue; }
+                let mut w = word;
+                while w != 0 {
+                    let bit  = w.trailing_zeros() as usize;
+                    let s    = wi * 64 + bit;
+                    if s < n { primary[s] += 1; }
+                    w &= w - 1;
+                }
+            }
+        }
+        debug_assert!(
+            !self.primary_bytes().contains(&255),
+            "sentinel 255 reached in inc_present_fast — group size must be < 255"
+        );
+    }
+
+    /// Two-pass: primary bytes then overflow.  Increments `self[slot]` for each
+    /// slot where `pred(col[slot])` is true.  Safe for any group size.
+    ///
+    /// Primary bytes equal to the 255 sentinel are skipped in the first pass;
+    /// those slots are evaluated from `col`'s overflow entries in the second.
+    ///
+    /// # Panics
+    /// Panics if `col.len()` differs from the length of this builder.
+    pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let n = col.len();
+        // Fetch the primary slice once instead of on every iteration.
+        let primary = col.primary_bytes();
+        for (slot, &b) in primary[..n].iter().enumerate() {
+            if b < 255 && pred(b as u32) {
+                self.inc(slot);
+            }
+        }
+        for (slot, val) in col.overflow_entries() {
+            if pred(val) { self.inc(slot); }
+        }
+    }
+
+    /// Fast two-pass: raw u8 arithmetic.  Caller guarantees no counter reaches 255.
+    ///
+    /// Same slot selection as `inc_predicate`, but the counters are bumped
+    /// directly in the primary bytes without going through `get`/`set`. A
+    /// debug assertion checks afterwards that no primary byte became 255.
+    ///
+    /// # Panics
+    /// Panics if `col.len()` differs from the length of this builder.
+    pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
+        let n = col.len();
+        {
+            let source  = col.primary_bytes();
+            let primary = self.primary_bytes_mut();
+            for slot in 0..n {
+                let b = source[slot];
+                if b < 255 && pred(b as u32) {
+                    primary[slot] += 1;
+                }
+            }
+        }
+        for (slot, val) in col.overflow_entries() {
+            if pred(val) { self.primary_bytes_mut()[slot] += 1; }
+        }
+        debug_assert!(
+            !self.primary_bytes().contains(&255),
+            "sentinel 255 reached in inc_predicate_fast — group size must be < 255"
+        );
+    }
+
+    /// Adds `other` to `self` slot by slot.
+    ///
+    /// When both primary bytes are below the 255 sentinel the sum is computed
+    /// on the bytes directly and only routed through `set` if it reaches 255.
+    /// Otherwise the full values are fetched with `get` and added as `u32`.
+    ///
+    /// # Panics
+    /// Panics if the lengths differ, or if a per-slot sum overflows `u32`.
+    pub fn add(&mut self, other: IntSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "length mismatch");
+        let n = self.n;
+        for s in 0..n {
+            let sb = self.primary_bytes()[s];
+            let ob = other.primary_bytes()[s];
+            if sb < 255 && ob < 255 {
+                let sum = sb as u32 + ob as u32;
+                if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; }
+                else         { self.set(s, sum); }
+            } else {
+                let sv = self.get(s);
+                let ov = other.get(s);
+                // Checked: a plain `+` would wrap silently in release builds.
+                self.set(s, sv.checked_add(ov).expect("u32 overflow in add"));
             }
         }
     }

-    pub fn add(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            let cur = self.get(slot);
-            self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add"));
+    /// Replaces each slot of `self` with the minimum of itself and `other`.
+    ///
+    /// The primary bytes are reduced with a byte-wise minimum. Since 255 is
+    /// the overflow sentinel, a slot stays at 255 only when both sides
+    /// overflow; for those slots the minimum of the two overflow values is
+    /// stored through `set`. Self's overflow map is cleared up front and only
+    /// repopulated by those `set` calls.
+    ///
+    /// # Panics
+    /// Panics if `other.len()` differs from the length of this builder.
+    pub fn min(&mut self, other: IntSliceView<'_>) {
+        // Without this check the zip below would silently stop at the shorter input.
+        assert_eq!(self.n, other.len(), "length mismatch");
+        let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect();
+        let other_ov: HashMap<usize, u32> = other.overflow_entries().collect();
+        self.clear_overflow();
+        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
+            if b < *a { *a = b; }
+        }
+        for (slot, self_val) in self_ov {
+            if let Some(&other_val) = other_ov.get(&slot) {
+                self.set(slot, self_val.min(other_val));
+            }
         }
     }

-    pub fn diff(&mut self, other: &PersistentCompactIntVec) {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        for (slot, other_val) in other.iter().enumerate() {
-            self.set(slot, self.get(slot).saturating_sub(other_val));
+    /// Replaces each slot of `self` with the maximum of itself and `other`.
+    ///
+    /// Slots where `other` overflows are resolved first through `get`/`set`.
+    /// The remaining slots are reduced with a byte-wise maximum on the
+    /// primary bytes, where the 255 sentinel compares as the largest byte.
+    ///
+    /// # Panics
+    /// Panics if `other.len()` differs from the length of this builder.
+    pub fn max(&mut self, other: IntSliceView<'_>) {
+        // Without this check the zip below would silently stop at the shorter input.
+        assert_eq!(self.n, other.len(), "length mismatch");
+        for (slot, other_val) in other.overflow_entries() {
+            let sv = self.get(slot);
+            self.set(slot, sv.max(other_val));
+        }
+        for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) {
+            if b > *a { *a = b; }
+        }
+    }
+
+    /// Saturating per-slot subtraction: `self[s] = self[s].saturating_sub(other[s])`.
+    ///
+    /// When self's primary byte is below the 255 sentinel the result is
+    /// computed on bytes; an overflowing `other` slot forces the result to 0.
+    /// Otherwise the full values are read and the result stored through `set`.
+    ///
+    /// # Panics
+    /// Panics if `other.len()` differs from the length of this builder.
+    pub fn diff(&mut self, other: IntSliceView<'_>) {
+        assert_eq!(self.n, other.len(), "length mismatch");
+        let n = self.n;
+        for s in 0..n {
+            let sb = self.primary_bytes()[s];
+            let ob = other.primary_bytes()[s];
+            if sb < 255 {
+                self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 };
+            } else {
+                let sv = self.get(s);
+                let ov = if ob < 255 { ob as u32 } else { other.get(s) };
+                self.set(s, sv.saturating_sub(ov));
+            }
+        }
+    }
+
+    /// Zeroes every slot whose bit in `mask` is 0; slots whose bit is 1 are
+    /// left untouched.
+    ///
+    /// Words of `mask` that are all ones are skipped. Zero bits that fall at
+    /// or beyond `n` (padding of the last word) are ignored.
+    ///
+    /// # Panics
+    /// Panics if `mask.len()` differs from the length of this builder; a
+    /// shorter mask would otherwise leave the trailing slots unmasked.
+    pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
+        assert_eq!(self.n, mask.len(), "BitSliceView length mismatch");
+        let n = self.n;
+        for (wi, &word) in mask.words().iter().enumerate() {
+            if word == u64::MAX { continue; }
+            let mut zeros = !word;
+            while zeros != 0 {
+                let bit = zeros.trailing_zeros() as usize;
+                let s   = wi * 64 + bit;
+                if s < n {
+                    let b = self.primary_bytes()[s];
+                    // `set` is used so a sentinel slot is handled by the builder.
+                    if b != 0 { self.set(s, 0); }
+                }
+                // Drop the lowest set bit.
+                zeros &= zeros - 1;
+            }
+        }
     }

-    /// Flush the primary mmap, then write sorted overflow data + index and fix the header.
    pub fn close(self) -> io::Result<()> {
        self.mmap.flush()?;
-        let Self {
-            path,
-            mmap,
-            n,
-            overflow,
-        } = self;
+        let Self { path, mmap, n, overflow } = self;
        drop(mmap);
-
        let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
        entries.sort_unstable_by_key(|&(slot, _)| slot);
-
        finalize_pciv(&path, n, &entries)
    }
+
+    pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
+        let path = self.path.clone();
+        self.close()?;
+        PersistentCompactIntVec::open(&path)
+    }
 }
@@ -0,0 +1,137 @@
+use std::io;
+
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
+use crate::tempintvec::TempCompactIntVec;
+
+// ── ColGroup ──────────────────────────────────────────────────────────────────
+
+/// A named set of matrix columns, stored as column indices.
+///
+/// A group is defined once at the index level: every partition and layer
+/// shares the same column structure (samples / genomes) and only the row
+/// space (kmer slots) is partitioned, so the same indices apply everywhere.
+pub struct ColGroup {
+    /// Label of the group.
+    pub name:    String,
+    /// Indices of the member columns within the matrix.
+    pub indices: Vec<usize>,
+}
+
+impl ColGroup {
+    /// Build a group from any string-like `name` and an owned list of column indices.
+    pub fn new(name: impl Into<String>, indices: Vec<usize>) -> Self {
+        let name: String = name.into();
+        ColGroup { name, indices }
+    }
+}
+
+// ── MatrixGroupOps ────────────────────────────────────────────────────────────
+
+/// Group-level aggregations computed over the columns of one matrix.
+///
+/// Implementors supply five primitives: `partial_group_presence_count`,
+/// `partial_group_sum`, `partial_group_any`, `partial_group_min` and
+/// `partial_group_max`.
+///
+/// `partial_group_all` and `partial_group_none` ship with default bodies built
+/// on `partial_group_presence_count`; overriding them should rarely be needed.
+pub trait MatrixGroupOps {
+    /// For each slot, the number of group columns whose value is ≥ `threshold`.
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;
+
+    /// For each slot, the sum of the values of all group columns.
+    fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
+
+    /// For each slot, OR over the group: 1 if any group column has value ≥ `threshold`.
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;
+
+    /// For each slot, the smallest value among the group columns (0 if the group is empty).
+    fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
+
+    /// For each slot, the largest value among the group columns (0 if the group is empty).
+    fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;
+
+    /// For each slot, AND over the group: 1 if ALL group columns have value ≥ `threshold`.
+    ///
+    /// Derived from the presence count: a slot passes when its count reaches
+    /// the number of indices in the group.
+    fn partial_group_all(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
+        let presence = self.partial_group_presence_count(g, threshold)?;
+        let group_size = g.indices.len() as u32;
+        let mut mask = TempBitVecBuilder::new(presence.len())?;
+        mask.or_where(presence.view(), |count| count >= group_size);
+        mask.freeze()
+    }
+
+    /// For each slot, NOR over the group: 1 if NO group column has value ≥ `threshold`.
+    ///
+    /// Derived from the presence count: a slot passes when its count is zero.
+    fn partial_group_none(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
+        let presence = self.partial_group_presence_count(g, threshold)?;
+        let mut mask = TempBitVecBuilder::new(presence.len())?;
+        mask.or_where(presence.view(), |count| count == 0);
+        mask.freeze()
+    }
+}
+
+// ── FilterMask — expression tree for column-based slot filters ────────────────
+
+/// A composable filter expression that can be evaluated against a matrix
+/// using only column operations (no MPHF lookup per kmer).
+///
+/// Every leaf variant carries its own `indices`: the matrix columns the test
+/// is applied to.
+///
+/// `threshold` semantics follow [`MatrixGroupOps::partial_group_presence_count`]:
+/// a column is counted for a slot when the slot's value in that column is
+/// **≥ threshold**.
+/// To match the row-level filter (`value > t`), callers should pass `t + 1`.
+#[derive(Debug, Clone)]
+pub enum FilterMask {
+    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≥ `min_count`.
+    PresenceGeq { indices: Vec<usize>, threshold: u32, min_count: usize },
+    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≤ `max_count`.
+    PresenceLeq { indices: Vec<usize>, threshold: u32, max_count: usize },
+    /// Slot passes if sum of values across `indices` columns is ≥ `min_sum`.
+    SumGeq { indices: Vec<usize>, min_sum: u32 },
+    /// Slot passes if sum of values across `indices` columns is ≤ `max_sum`.
+    SumLeq { indices: Vec<usize>, max_sum: u32 },
+    /// Slot passes if it passes all sub-expressions. Empty `And` is always true.
+    /// Sub-expressions may themselves be `And`, so filters nest to any depth.
+    And(Vec<FilterMask>),
+}
+
+/// Evaluate a [`FilterMask`] against `mat`, returning a per-slot `TempBitVec`
+/// where bit=1 means the slot passes the filter.
+///
+/// `n` is the number of slots: every bit vector built here, including the
+/// result, is created with that length.
+///
+/// Count bounds (`min_count` / `max_count`) are saturated to `u32::MAX` before
+/// being compared with the per-slot counts. A plain `as u32` cast would wrap
+/// an oversized bound (for example 2^32 becomes 0) and silently invert the
+/// filter.
+///
+/// `And` evaluates its parts recursively, in order, and intersects them
+/// starting from an all-ones mask, so an empty `And` passes every slot.
+///
+/// # Errors
+/// Returns the first `io::Error` raised by a matrix aggregation or by a
+/// bit-vector builder.
+pub fn eval_filter_mask(expr: &FilterMask, mat: &dyn MatrixGroupOps, n: usize) -> io::Result<TempBitVec> {
+    match expr {
+        FilterMask::PresenceGeq { indices, threshold, min_count } => {
+            let g = ColGroup::new("", indices.clone());
+            let counts = mat.partial_group_presence_count(&g, *threshold)?;
+            // Saturate rather than truncate: a bound above u32::MAX must not wrap.
+            let mc = u32::try_from(*min_count).unwrap_or(u32::MAX);
+            let mut b = TempBitVecBuilder::new(n)?;
+            b.or_where(counts.view(), |v| v >= mc);
+            b.freeze()
+        }
+        FilterMask::PresenceLeq { indices, threshold, max_count } => {
+            let g = ColGroup::new("", indices.clone());
+            let counts = mat.partial_group_presence_count(&g, *threshold)?;
+            // Saturate rather than truncate: an oversized upper bound lets every slot pass.
+            let mc = u32::try_from(*max_count).unwrap_or(u32::MAX);
+            let mut b = TempBitVecBuilder::new(n)?;
+            b.or_where(counts.view(), |v| v <= mc);
+            b.freeze()
+        }
+        FilterMask::SumGeq { indices, min_sum } => {
+            let g = ColGroup::new("", indices.clone());
+            let sums = mat.partial_group_sum(&g)?;
+            let mut b = TempBitVecBuilder::new(n)?;
+            let ms = *min_sum;
+            b.or_where(sums.view(), |v| v >= ms);
+            b.freeze()
+        }
+        FilterMask::SumLeq { indices, max_sum } => {
+            let g = ColGroup::new("", indices.clone());
+            let sums = mat.partial_group_sum(&g)?;
+            let mut b = TempBitVecBuilder::new(n)?;
+            let ms = *max_sum;
+            b.or_where(sums.view(), |v| v <= ms);
+            b.freeze()
+        }
+        FilterMask::And(parts) => {
+            // Start from all ones so that an empty `And` is always true.
+            let mut b = TempBitVecBuilder::new_ones(n)?;
+            for part in parts {
+                let m = eval_filter_mask(part, mat, n)?;
+                b.and(m.view());
+            }
+            b.freeze()
+        }
+    }
+}
@@ -13,6 +13,44 @@ pub const OVERFLOW_ENTRY_SIZE: usize = 12;
 // Index entry: slot(u64) + pos(u64) = 16 bytes.
 pub const INDEX_ENTRY_SIZE: usize = 16;

+/// Total of all values stored in a compact-int primary byte slice.
+///
+/// `primary` holds one raw byte per slot, where 255 is a sentinel standing in
+/// for a large value. `overflow` yields the true value (≥ 255) of each
+/// sentinel, in any order. Each sentinel was counted as 255 in the raw byte
+/// sum, so that placeholder is subtracted once per overflow entry and the
+/// true value added instead.
+#[inline]
+pub(crate) fn byte_sum(primary: &[u8], overflow: impl Iterator<Item = u32>) -> u64 {
+    let mut raw_total = 0u64;
+    for &byte in primary {
+        raw_total += byte as u64;
+    }
+
+    let mut sentinel_count = 0u64;
+    let mut overflow_total = 0u64;
+    for value in overflow {
+        sentinel_count += 1;
+        overflow_total += value as u64;
+    }
+
+    raw_total - 255 * sentinel_count + overflow_total
+}
+
+/// Number of non-zero values in a compact-int primary byte slice.
+///
+/// The overflow sentinel (255) is itself a non-zero byte, so testing each raw
+/// byte against zero is enough and the overflow data never has to be read.
+#[inline]
+pub(crate) fn byte_count_nonzero(primary: &[u8]) -> u64 {
+    let mut nonzero = 0usize;
+    for &byte in primary {
+        if byte != 0 {
+            nonzero += 1;
+        }
+    }
+    nonzero as u64
+}
+
+/// Decode overflow entry number `i` of the table starting at byte `base` in `data`.
+///
+/// An entry is `OVERFLOW_ENTRY_SIZE` bytes wide and begins with a
+/// little-endian `u64` slot followed by a little-endian `u32` value.
+/// Returns `(slot, value)`. Panics if the entry lies outside `data`.
+#[inline]
+pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) {
+    let start = base + i * OVERFLOW_ENTRY_SIZE;
+
+    let mut slot_le = [0u8; 8];
+    slot_le.copy_from_slice(&data[start..start + 8]);
+
+    let mut value_le = [0u8; 4];
+    value_le.copy_from_slice(&data[start + 8..start + 12]);
+
+    (u64::from_le_bytes(slot_le) as usize, u32::from_le_bytes(value_le))
+}
+
+/// Decode sparse-index entry number `i` of the table starting at byte `base` in `data`.
+///
+/// An entry is `INDEX_ENTRY_SIZE` bytes wide and holds two little-endian
+/// `u64` fields, slot then position. Returns `(slot, pos)`.
+/// Panics if the entry lies outside `data`.
+#[inline]
+pub fn parse_index_entry(data: &[u8], base: usize, i: usize) -> (usize, usize) {
+    let start = base + i * INDEX_ENTRY_SIZE;
+    // Both fields share the same encoding, so read them with one helper.
+    let read_u64 = |at: usize| -> u64 {
+        let mut le = [0u8; 8];
+        le.copy_from_slice(&data[at..at + 8]);
+        u64::from_le_bytes(le)
+    };
+    let slot = read_u64(start) as usize;
+    let pos  = read_u64(start + 8) as usize;
+    (slot, pos)
+}
+
 // Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries).
 pub const L1_INDEX_ENTRIES: usize = 2048;

@@ -1,16 +1,20 @@
-use std::cmp::Ordering;
 use std::fs::{self, File};
-use std::io::{self, Write as _};
+use std::io::{self, BufWriter, Write as _};
 use std::path::{Path, PathBuf};

 use memmap2::Mmap;
 use ndarray::{Array1, Array2};
 use rayon::prelude::*;

+use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix};
 use crate::builder::PersistentCompactIntVecBuilder;
-use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE};
+use crate::colgroup::{ColGroup, MatrixGroupOps};
+use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE};
 use crate::meta::MatrixMeta;
 use crate::reader::PersistentCompactIntVec;
+use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
+use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
+use crate::views::IntSliceView;

 fn col_path(dir: &Path, col: usize) -> PathBuf {
    dir.join(format!("col_{col:06}.pciv"))
@@ -41,9 +45,7 @@ impl ColumnarCompactIntMatrix {
    }

    pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
-        for (c, col) in self.cols.iter().enumerate() {
-            buf[c] = col.get(slot);
-        }
+        for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); }
    }

    pub(crate) fn sum(&self) -> Array1<u64> {
@@ -54,50 +56,35 @@ impl ColumnarCompactIntMatrix {
        Array1::from_vec(sums)
    }

-    pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j)))
-    }
-
-    pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
-        self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j)))
-    }
-
-    pub(crate) fn partial_threshold_jaccard_dist_matrix(
-        &self, threshold: u32,
-    ) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols();
-        let pairs = upper_pairs(n);
-        let results: Vec<(usize, usize, u64, u64)> = pairs
+    pub(crate) fn count_nonzero(&self) -> Array1<u64> {
+        let counts: Vec<u64> = (0..self.n_cols())
            .into_par_iter()
-            .map(|(i, j)| {
-                let (inter, union) =
-                    self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold);
-                (i, j, inter, union)
-            })
+            .map(|c| self.col(c).count_nonzero())
            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        Array1::from_vec(counts)
    }

+    pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j)))
+    }
+    pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
+        pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j)))
+    }
+    pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2<u64>, Array2<u64>) {
+        pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold))
+    }
    pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
            self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
        })
    }
-
    pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
            self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
        })
    }
-
    pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| {
+        pairwise_matrix(self.n_cols(), |i, j| {
            self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64)
        })
    }
@@ -110,20 +97,6 @@ impl ColumnarCompactIntMatrix {
        meta.n_cols += 1;
        meta.save(dir)
    }
-
-    fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2<f64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, f64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols();
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
-    }
 }

 // ── PackedCompactIntMatrix ────────────────────────────────────────────────────
@@ -131,13 +104,10 @@ impl ColumnarCompactIntMatrix {
 const PCMX_MAGIC:  [u8; 4] = *b"PCMX";
 const PCMX_HEADER: usize   = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8)

-/// Per-column metadata pre-parsed from the embedded PCIV header.
 struct ColInfo {
-    primary_start: usize,  // absolute mmap offset to primary array
-    data_offset:   usize,  // absolute mmap offset to overflow array
+    primary_start: usize,
+    data_offset:   usize,
    n_overflow:    usize,
-    step:          usize,
-    index:         Vec<(usize, usize)>,
 }

 pub struct PackedCompactIntMatrix {
@@ -163,61 +133,31 @@ impl PackedCompactIntMatrix {
        for c in 0..n_cols {
            let off_pos  = PCMX_HEADER + c * 8;
            let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize;
-            // Parse embedded PCIV header at col_base
-            let n_ov    = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
-            let n_idx   = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize;
-            let step    = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize;
-            let n_pciv  = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap())  as usize;
-
+            let n_ov   = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize;
+            let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap())  as usize;
            let primary_start = col_base + HEADER_SIZE;
            let data_offset   = primary_start + n_pciv;
-            let index_offset  = data_offset + n_ov * OVERFLOW_ENTRY_SIZE;
-
-            let mut index = Vec::with_capacity(n_idx);
-            for i in 0..n_idx {
-                let ioff  = index_offset + i * INDEX_ENTRY_SIZE;
-                let slot  = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap())   as usize;
-                let pos   = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize;
-                index.push((slot, pos));
-            }
-            columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index });
+            columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov });
        }
-
        Ok(Self { mmap, n_rows, n_cols, columns })
    }

-    #[inline]
-    pub(crate) fn get(&self, col: usize, slot: usize) -> u32 {
-        let ci = &self.columns[col];
-        let v = self.mmap[ci.primary_start + slot];
-        if v < 255 { return v as u32; }
-        self.overflow_get(ci, slot)
+    pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> {
+        let ci = &self.columns[c];
+        let primary     = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows];
+        let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE];
+        IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows)
    }

-    fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 {
-        let (pos_start, pos_end) = if ci.step == 0 {
-            (0, ci.n_overflow)
-        } else {
-            let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
-            let start = ci.index[i].1;
-            let end   = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow };
-            (start, end)
-        };
-        let mut lo = pos_start;
-        let mut hi = pos_end;
-        while lo < hi {
-            let mid = lo + (hi - lo) / 2;
-            let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE;
-            let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize;
-            match stored.cmp(&slot) {
-                Ordering::Equal   => return u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()),
-                Ordering::Less    => lo = mid + 1,
-                Ordering::Greater => hi = mid,
-            }
-        }
-        panic!("slot {slot} marked overflow but not found")
+    pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
+        let view = self.col_view(c);
+        let overflow: std::collections::HashMap<usize, u32> = view.overflow_entries().collect();
+        PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path)
    }

+    #[inline]
+    pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) }
+
    pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) {
        for c in 0..self.n_cols { buf[c] = self.get(c, slot); }
    }
@@ -228,141 +168,96 @@ impl PackedCompactIntMatrix {

    pub(crate) fn sum(&self) -> Array1<u64> {
        Array1::from_vec(
-            (0..self.n_cols).into_par_iter()
-                .map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum())
-                .collect()
+            (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect()
        )
    }

-    // ── Pair primitives ───────────────────────────────────────────────────────
+    pub(crate) fn count_nonzero(&self) -> Array1<u64> {
+        Array1::from_vec(
+            (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect()
+        )
+    }

    fn pair_partial_bray(&self, i: usize, j: usize) -> u64 {
-        (0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum()
    }
-
    fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 {
-        (0..self.n_rows).map(|s| {
-            let d = self.get(i, s) as f64 - self.get(j, s) as f64;
-            d * d
-        }).sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum()
    }
-
    fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) {
-        let (mut inter, mut union) = (0u64, 0u64);
-        for s in 0..self.n_rows {
-            let a = self.get(i, s) >= t;
-            let b = self.get(j, s) >= t;
-            if a && b { inter += 1; }
-            if a || b { union += 1; }
-        }
-        (inter, union)
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .fold((0u64, 0u64), |(inter, uni), (a, b)| {
+                let ap = a >= t; let bp = b >= t;
+                (inter + (ap & bp) as u64, uni + (ap | bp) as u64)
+            })
    }
-
    fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
        if si == 0.0 || sj == 0.0 { return 0.0; }
-        (0..self.n_rows).map(|s| {
-            (self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj)
-        }).sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum()
    }
-
    fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
        if si == 0.0 || sj == 0.0 { return 0.0; }
-        (0..self.n_rows).map(|s| {
-            let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj;
-            d * d
-        }).sum()
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum()
    }
-
    fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 {
        if si == 0.0 || sj == 0.0 { return 0.0; }
-        (0..self.n_rows).map(|s| {
-            let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt();
-            d * d
-        }).sum()
-    }
-
-    // ── Matrix methods ────────────────────────────────────────────────────────
-
-    fn pairwise<T>(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2<T>
-    where T: Clone + Default + Send {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, T)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| { let w = v.clone(); (i, j, v, w) }))
-    }
-
-    fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2<u64> {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64)> = upper_pairs(n)
-            .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect();
-        fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v)))
+        self.col_view(i).iter().zip(self.col_view(j).iter())
+            .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum()
    }

    pub(crate) fn partial_bray_dist_matrix(&self) -> Array2<u64> {
-        self.pairwise_u64(|i, j| self.pair_partial_bray(i, j))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j))
    }
-
-
    pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_euclidean(i, j))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j))
    }
-
    pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2<u64>, Array2<u64>) {
-        let n = self.n_cols;
-        let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n)
-            .into_par_iter()
-            .map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) })
-            .collect();
-        let mut inter_m = Array2::zeros((n, n));
-        let mut union_m = Array2::zeros((n, n));
-        for (i, j, inter, union) in results {
-            inter_m[[i, j]] = inter; inter_m[[j, i]] = inter;
-            union_m[[i, j]] = union; union_m[[j, i]] = union;
-        }
-        (inter_m, union_m)
+        pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t))
    }
-
    pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64))
    }
-
    pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64))
    }
-
    pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
-        self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
+        pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64))
    }
-
 }

 /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files.
 pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> {
-    let meta = MatrixMeta::load(dir)?;
+    let packed_path = dir.join("matrix.pcmx");
+    if packed_path.exists() {
+        if let Ok(meta) = MatrixMeta::load(dir) {
+            for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); }
+            let _ = fs::remove_file(dir.join("meta.json"));
+        }
+        return Ok(());
+    }
+    let meta   = MatrixMeta::load(dir)?;
    let n_cols = meta.n_cols;
-
-    let col_files: Vec<Vec<u8>> = (0..n_cols)
-        .map(|c| fs::read(col_path(dir, c)))
+    let col_sizes: Vec<u64> = (0..n_cols)
+        .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len()))
        .collect::<io::Result<_>>()?;
-
-    let header_size = PCMX_HEADER + n_cols * 8;
+    let header_size = (PCMX_HEADER + n_cols * 8) as u64;
    let mut col_offset = header_size;
    let mut offsets = Vec::with_capacity(n_cols);
-    for data in &col_files {
-        offsets.push(col_offset as u64);
-        col_offset += data.len();
-    }
-
-    let packed_path = dir.join("matrix.pcmx");
-    let mut file = File::create(&packed_path)?;
-    file.write_all(&PCMX_MAGIC)?;
-    file.write_all(&[0u8; 4])?;
-    file.write_all(&(meta.n as u64).to_le_bytes())?;
-    file.write_all(&(n_cols as u64).to_le_bytes())?;
-    for &off in &offsets { file.write_all(&off.to_le_bytes())?; }
-    for data in &col_files { file.write_all(data)?; }
-    drop(file);
-
+    for &size in &col_sizes { offsets.push(col_offset); col_offset += size; }
+    let tmp_path = dir.join("matrix.pcmx.tmp");
+    let mut out = BufWriter::new(File::create(&tmp_path)?);
+    out.write_all(&PCMX_MAGIC)?;
+    out.write_all(&[0u8; 4])?;
+    out.write_all(&(meta.n as u64).to_le_bytes())?;
+    out.write_all(&(n_cols as u64).to_le_bytes())?;
+    for &off in &offsets { out.write_all(&off.to_le_bytes())?; }
+    for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; }
+    out.flush()?;
+    drop(out);
+    fs::rename(&tmp_path, &packed_path)?;
    for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; }
    fs::remove_file(dir.join("meta.json"))?;
    Ok(())
@@ -376,18 +271,14 @@ pub enum PersistentCompactIntMatrix {
 }

 impl PersistentCompactIntMatrix {
-    /// Open from `layer_dir`, auto-detecting Packed or Columnar.
    pub fn open(layer_dir: &Path) -> io::Result<Self> {
        let counts_dir = layer_dir.join("counts");
-
        if counts_dir.join("matrix.pcmx").exists() {
            return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?));
        }
-
        if MatrixMeta::load(&counts_dir).is_ok() {
            return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?));
        }
-
        Err(io::Error::new(
            io::ErrorKind::NotFound,
            format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()),
@@ -397,7 +288,6 @@ impl PersistentCompactIntMatrix {
    pub fn n(&self) -> usize {
        match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows }
    }
-
    pub fn n_cols(&self) -> usize {
        match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols }
    }
@@ -409,18 +299,32 @@ impl PersistentCompactIntMatrix {
        }
    }

+    /// Borrow column `c` as an `IntSliceView`, whichever storage variant backs
+    /// the matrix: the columnar variant takes the view of its column vector,
+    /// the packed variant builds the view through its own `col_view`.
+    pub fn col_view(&self, c: usize) -> IntSliceView<'_> {
+        match self {
+            Self::Columnar(m) => m.col(c).view(),
+            Self::Packed(m)   => m.col_view(c),
+        }
+    }
+
+    /// Create a `PersistentCompactIntVecBuilder` at `path` from column `c`.
+    ///
+    /// The columnar variant goes through
+    /// `PersistentCompactIntVecBuilder::build_from` on its column vector; the
+    /// packed variant delegates to the packed matrix's own `col_persist`.
+    /// Presumably the returned builder starts out holding the column's
+    /// current values — confirm against the builder constructors.
+    pub fn col_persist(&self, c: usize, path: &Path) -> io::Result<PersistentCompactIntVecBuilder> {
+        match self {
+            Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path),
+            Self::Packed(m)   => m.col_persist(c, path),
+        }
+    }
+
    pub fn row(&self, slot: usize) -> Box<[u32]> {
        match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) }
    }
-
    pub fn fill_row(&self, slot: usize, buf: &mut [u32]) {
        match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) }
    }
-
    pub fn sum(&self) -> Array1<u64> {
        match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() }
    }
-
+    /// Non-zero count of each column, one `u64` per column, computed by the
+    /// `count_nonzero` of whichever storage variant backs the matrix.
+    pub fn count_nonzero(&self) -> Array1<u64> {
+        match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() }
+    }
    pub fn partial_bray_dist_matrix(&self) -> Array2<u64> {
        match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() }
    }
@@ -439,7 +343,6 @@ impl PersistentCompactIntMatrix {
    pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1<u64>) -> Array2<f64> {
        match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) }
    }
-
    pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> {
        ColumnarCompactIntMatrix::append_column(dir, value_of)
    }
@@ -451,15 +354,16 @@ use crate::traits::{ColumnWeights, CountPartials};

 impl ColumnWeights for PersistentCompactIntMatrix {
    fn col_weights(&self) -> Array1<u64> { self.sum() }
+    fn partial_kmer_counts(&self) -> Array1<u64> { self.count_nonzero() }
 }

 impl CountPartials for PersistentCompactIntMatrix {
-    fn partial_bray(&self) -> Array2<u64>                        { self.partial_bray_dist_matrix() }
-    fn partial_euclidean(&self) -> Array2<f64>                   { self.partial_euclidean_dist_matrix() }
+    fn partial_bray(&self) -> Array2<u64>                                 { self.partial_bray_dist_matrix() }
+    fn partial_euclidean(&self) -> Array2<f64>                            { self.partial_euclidean_dist_matrix() }
    fn partial_threshold_jaccard(&self, t: u32) -> (Array2<u64>, Array2<u64>) { self.partial_threshold_jaccard_dist_matrix(t) }
-    fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64>     { self.partial_relfreq_bray_dist_matrix(g) }
-    fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64> { self.partial_relfreq_euclidean_dist_matrix(g) }
-    fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64>         { self.partial_hellinger_euclidean_dist_matrix(g) }
+    fn partial_relfreq_bray(&self, g: &Array1<u64>) -> Array2<f64>        { self.partial_relfreq_bray_dist_matrix(g) }
+    fn partial_relfreq_euclidean(&self, g: &Array1<u64>) -> Array2<f64>   { self.partial_relfreq_euclidean_dist_matrix(g) }
+    fn partial_hellinger(&self, g: &Array1<u64>) -> Array2<f64>           { self.partial_hellinger_euclidean_dist_matrix(g) }
 }

 // ── Builder ───────────────────────────────────────────────────────────────────
@@ -475,30 +379,88 @@ impl PersistentCompactIntMatrixBuilder {
        fs::create_dir_all(dir)?;
        Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 })
    }
-
    pub fn n(&self)      -> usize { self.n }
    pub fn n_cols(&self) -> usize { self.n_cols }
-
    pub fn add_col(&mut self) -> io::Result<PersistentCompactIntVecBuilder> {
        let path = col_path(&self.dir, self.n_cols);
        self.n_cols += 1;
        PersistentCompactIntVecBuilder::new(self.n, &path)
    }

+    /// Appends a column by persisting a frozen temporary vector at the next
+    /// column path (`TempCompactIntVec::make_persistent` copies its file there).
+    ///
+    /// The column counter is advanced only when that call succeeds; on error
+    /// the error is returned and `n_cols` is left as it was.
+    pub fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()> {
+        let dest = col_path(&self.dir, self.n_cols);
+        src.make_persistent(&dest).map(|_| {
+            self.n_cols += 1;
+        })
+    }
+
+    /// Appends a column derived from a bit vector: a fresh column builder is
+    /// created at the next column path, `inc_present` is applied once with the
+    /// bits of `src`, and the column is closed.
+    ///
+    /// `n_cols` is advanced only after the column has been closed
+    /// successfully, so an I/O failure while creating or closing the column
+    /// does not leave this builder counting a column that was never completed.
+    // NOTE(review): `inc_present` presumably increments every slot whose bit is
+    // set, yielding a 0/1 column — confirm against PersistentCompactIntVecBuilder.
+    pub fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> {
+        let path = col_path(&self.dir, self.n_cols);
+        let mut b = PersistentCompactIntVecBuilder::new(self.n, &path)?;
+        b.inc_present(src.view());
+        b.close()?;
+        // Count the column last, once everything fallible has succeeded.
+        self.n_cols += 1;
+        Ok(())
+    }
+
    pub fn close(self) -> io::Result<()> {
        MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir)
    }
 }

-// ── Helpers ───────────────────────────────────────────────────────────────────
+// ── MatrixGroupOps ────────────────────────────────────────────────────────────

-fn upper_pairs(n: usize) -> Vec<(usize, usize)> {
-    (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect()
-}
+impl MatrixGroupOps for PersistentCompactIntMatrix {
+    /// Tallies, per slot, the columns of `g` whose value satisfies
+    /// `v >= threshold`.
+    ///
+    /// Groups of at most 254 columns are tallied into a single temporary
+    /// builder. Larger groups are cut into 254-column chunks; each chunk is
+    /// tallied and frozen on its own, then folded into the result with `add`.
+    // NOTE(review): the 254 cap presumably keeps `inc_predicate_fast` below the
+    // 255 primary-byte overflow marker — confirm against the builder.
+    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec> {
+        const MAX_FAST_COLS: usize = 254;
+
+        let n = self.n();
+        let meets = |v: u32| v >= threshold;
+
+        // Small group: one builder is enough.
+        if g.indices.len() <= MAX_FAST_COLS {
+            let mut tally = TempCompactIntVecBuilder::new(n)?;
+            for &c in &g.indices {
+                tally.inc_predicate_fast(self.col_view(c), meets);
+            }
+            return tally.freeze();
+        }
+
+        // Large group: tally chunk by chunk and accumulate the frozen chunks.
+        let mut total = TempCompactIntVecBuilder::new(n)?;
+        for chunk in g.indices.chunks(MAX_FAST_COLS) {
+            let mut partial = TempCompactIntVecBuilder::new(n)?;
+            for &c in chunk {
+                partial.inc_predicate_fast(self.col_view(c), meets);
+            }
+            total.add(partial.freeze()?.view());
+        }
+        total.freeze()
+    }

-fn fill_symmetric<T>(n: usize, vals: impl Iterator<Item = (usize, usize, T, T)>) -> Array2<T>
-where T: Clone + Default {
-    let mut m = Array2::from_elem((n, n), T::default());
-    for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; }
-    m
+    /// Folds every column of `g` into a fresh temporary builder with `add`
+    /// and freezes the result. An empty group freezes the untouched builder.
+    fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        let mut acc = TempCompactIntVecBuilder::new(self.n())?;
+        for &col in g.indices.iter() {
+            acc.add(self.col_view(col));
+        }
+        acc.freeze()
+    }
+
+    /// Builds a temporary bit vector by calling `or_where` once per column of
+    /// `g` with the predicate `v >= threshold`, then freezes it.
+    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
+        let reaches = |v: u32| v >= threshold;
+        let mut mask = TempBitVecBuilder::new(self.n())?;
+        for &col in g.indices.iter() {
+            mask.or_where(self.col_view(col), reaches);
+        }
+        mask.freeze()
+    }
+
+    /// Combines the columns of `g` with the builder's `min`.
+    ///
+    /// A fresh builder is seeded by `add`-ing the first column, after which
+    /// each remaining column is applied with `min`. An empty group freezes the
+    /// untouched builder.
+    fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        let mut acc = TempCompactIntVecBuilder::new(self.n())?;
+        let mut cols = g.indices.iter();
+        if let Some(&seed) = cols.next() {
+            acc.add(self.col_view(seed));
+            for &col in cols {
+                acc.min(self.col_view(col));
+            }
+        }
+        acc.freeze()
+    }
+
+    /// Combines the columns of `g` with the builder's `max`, starting from a
+    /// fresh builder, and freezes the result.
+    fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec> {
+        let mut acc = TempCompactIntVecBuilder::new(self.n())?;
+        for &col in g.indices.iter() {
+            acc.max(self.col_view(col));
+        }
+        acc.freeze()
+    }
 }
@@ -23,11 +23,6 @@ impl LayerMeta {
    }

    fn parse(s: &str) -> Option<Self> {
-        let key = "\"n\":";
-        let pos = s.find(key)? + key.len();
-        let rest = s[pos..].trim_start();
-        let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len());
-        let n = rest[..end].parse().ok()?;
-        Some(Self { n })
+        Some(Self { n: crate::meta::field(s, "n")? })
    }
 }
@@ -1,20 +1,28 @@
 mod bitvec;
 mod bitmatrix;
 mod builder;
+mod colgroup;
 mod format;
 mod intmatrix;
 mod layer_meta;
 mod meta;
 mod reader;
+mod tempbitvec;
+mod tempintvec;
+mod views;
 pub mod traits;

 pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder};
 pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix};
 pub use builder::PersistentCompactIntVecBuilder;
+pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
 pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
 pub use layer_meta::LayerMeta;
-pub use reader::PersistentCompactIntVec;
+pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
+pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
+pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
 pub use traits::{BitPartials, ColumnWeights, CountPartials};
+pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};

 #[cfg(test)]
 #[path = "tests/mod.rs"]
@@ -23,7 +23,7 @@ fn parse(s: &str) -> Option<MatrixMeta> {
    Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? })
 }

-fn field(s: &str, name: &str) -> Option<usize> {
+pub(crate) fn field(s: &str, name: &str) -> Option<usize> {
    let key = format!("\"{}\":", name);
    let pos = s.find(&key)? + key.len();
    let rest = s[pos..].trim_start();
@@ -4,7 +4,8 @@ use std::path::{Path, PathBuf};

 use memmap2::Mmap;

-use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE};
+use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry};
+use crate::views::IntSliceView;

 pub struct PersistentCompactIntVec {
    mmap: Mmap,
@@ -18,100 +19,60 @@ pub struct PersistentCompactIntVec {
 }

 impl PersistentCompactIntVec {
-    /// Opens a persistent compact int vector from the given path.
    pub fn open(path: &Path) -> io::Result<Self> {
        let mmap = unsafe { Mmap::map(&File::open(path)?)? };

        if mmap.len() < HEADER_SIZE {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidData,
-                "PCIV file too short",
-            ));
+            return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short"));
        }
        if &mmap[0..4] != &MAGIC {
            return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic"));
        }

-        let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize;
+        let n          = u64::from_le_bytes(mmap[8..16].try_into().unwrap())  as usize;
        let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
-        let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
-        let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;
+        let n_index    = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize;
+        let step       = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize;

        let primary_offset = HEADER_SIZE;
-        let data_offset = primary_offset + n;
-        let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;
+        let data_offset    = primary_offset + n;
+        let index_offset   = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE;

        let mut index = Vec::with_capacity(n_index);
        for i in 0..n_index {
-            let off = index_offset + i * INDEX_ENTRY_SIZE;
-            let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize;
-            let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize;
-            index.push((slot, pos));
+            index.push(parse_index_entry(&mmap, index_offset, i));
        }

-        Ok(Self {
-            mmap,
-            n,
-            n_overflow,
-            step,
-            index,
-            primary_offset,
-            data_offset,
-            path: path.to_path_buf(),
-        })
+        Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset, path: path.to_path_buf() })
    }

-    /// Returns the path of the compact int vector file.
-    pub fn path(&self) -> &Path {
-        &self.path
-    }
+    pub fn path(&self) -> &Path { &self.path }
+    pub fn len(&self)      -> usize { self.n }
+    pub fn is_empty(&self) -> bool  { self.n == 0 }

-    /// Returns the length of the compact int vector.
-    pub fn len(&self) -> usize {
-        self.n
-    }
-
-    /// Returns whether the compact int vector is empty.
-    pub fn is_empty(&self) -> bool {
-        self.n == 0
-    }
-
-    /// Returns the value at the given slot.
    pub fn get(&self, slot: usize) -> u32 {
        match self.mmap[self.primary_offset + slot] {
            255 => self.overflow_get(slot),
-            v => v as u32,
+            v   => v as u32,
        }
    }

-    /// Returns the value at the given slot from the overflow region.
    fn overflow_get(&self, slot: usize) -> u32 {
-        let pos_start;
-        let pos_end;
-
-        if self.step == 0 {
-            pos_start = 0;
-            pos_end = self.n_overflow;
+        let (pos_start, pos_end) = if self.step == 0 {
+            (0, self.n_overflow)
        } else {
-            let i = self
-                .index
-                .partition_point(|&(s, _)| s <= slot)
-                .saturating_sub(1);
-            pos_start = self.index[i].1;
-            pos_end = if i + 1 < self.index.len() {
-                self.index[i + 1].1
-            } else {
-                self.n_overflow
-            };
-        }
-
+            let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1);
+            let start = self.index[i].1;
+            let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow };
+            (start, end)
+        };
        let mut lo = pos_start;
        let mut hi = pos_end;
        while lo < hi {
            let mid = lo + (hi - lo) / 2;
            match self.data_slot(mid).cmp(&slot) {
-                std::cmp::Ordering::Equal => return self.data_value(mid),
-                std::cmp::Ordering::Less => lo = mid + 1,
+                std::cmp::Ordering::Equal   => return self.data_value(mid),
+                std::cmp::Ordering::Less    => lo = mid + 1,
                std::cmp::Ordering::Greater => hi = mid,
            }
        }
@@ -119,140 +80,91 @@ impl PersistentCompactIntVec {
    }

    #[inline]
-    /// Returns the slot at the given index in the overflow region.
    fn data_slot(&self, i: usize) -> usize {
        let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE;
        u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize
    }

    #[inline]
-    /// Returns the value at the given index in the overflow region.
    fn data_value(&self, i: usize) -> u32 {
        let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8;
        u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap())
    }

-    #[inline]
-    /// Returns the sum of all values in the compact int vector.
    pub fn sum(&self) -> u64 {
-        self.iter().map(|v| v as u64).sum()
+        let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n];
+        byte_sum(primary, (0..self.n_overflow).map(|i| self.data_value(i)))
    }

-    #[inline]
-    /// Returns the Bray-Curtis distance between two compact int vectors.
+    /// Passes the `n` primary bytes of the mapping to `byte_count_nonzero`
+    /// and returns its result; the overflow region is not consulted.
+    pub fn count_nonzero(&self) -> u64 {
+        let start = self.primary_offset;
+        byte_count_nonzero(&self.mmap[start..start + self.n])
+    }
+
+    /// Borrows this vector as an `IntSliceView` without copying: the primary
+    /// bytes and the raw overflow entries are both sub-slices of the mmap.
+    pub fn view(&self) -> IntSliceView<'_> {
+        let base = self.primary_offset;
+        let primary = &self.mmap[base..base + self.n];
+        let overflow_len = self.n_overflow * OVERFLOW_ENTRY_SIZE;
+        let overflow_raw = &self.mmap[self.data_offset..self.data_offset + overflow_len];
+        IntSliceView::new(primary, overflow_raw, self.n_overflow, self.n)
+    }
+
+    /// Returns an iterator positioned at slot 0 with its overflow cursor at 0.
+    pub fn iter(&self) -> Iter<'_> {
+        Iter {
+            pciv: self,
+            slot: 0,
+            overflow_pos: 0,
+        }
+    }
+
+    // ── Distance methods ──────────────────────────────────────────────────────
+
    pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
        let sum_min = self.partial_bray_dist(other);
        let denom = self.sum() + other.sum();
-        if denom == 0 {
-            return 0.0;
-        }
-        1.0 - 2.0 * sum_min as f64 / denom as f64
+        if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 }
    }

-    /// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis.
-    /// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`.
    pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 {
        assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
-            .map(|(a, b)| a.min(b) as u64)
-            .sum()
+        self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum()
    }

-    /// Returns the relative frequency Bray-Curtis distance between two compact int vectors.
-    ///
-    /// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts.
    pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 {
        assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_a = self.sum() as f64;
-        let sum_b = other.sum() as f64;
-        if sum_a == 0.0 && sum_b == 0.0 {
-            return 0.0;
-        }
-        let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b);
-        1.0 - sum_min
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; }
+        1.0 - self.partial_relfreq_bray_dist(other, sa, sb)
    }

-    /// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors.
-    ///
-    /// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency
-    /// Bray-Curtis distance over a set of vector pairs.
-    ///
-    /// Arguments:
-    /// - `other`: the other compact int vector to compare with
-    /// - `sum_a`: the sum of the first vector's counts
-    /// - `sum_b`: the sum of the second vector's counts
-    ///
-    /// Returns the sum of the minimum relative frequencies at each index.
-    pub fn partial_relfreq_bray_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        sum_a: f64,
-        sum_b: f64,
-    ) -> f64 {
+    pub fn partial_relfreq_bray_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
        assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_min: f64 = self
-            .iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
            .map(|(a, b)| {
                let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
                let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
                pa.min(pb)
            })
-            .sum();
-        sum_min
+            .sum()
    }

-    /// Returns the euclidean distance between two compact int vectors.
    pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
        self.partial_euclidean_dist(other).sqrt()
    }

-    /// Returns the partial euclidean distance between two compact int vectors.
-    ///
-    /// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance
-    /// over a set of vector pairs.
-    ///
-    /// The result is the sum of the squared differences between corresponding elements of the two
-    /// vectors.
    pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
        assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
-            .map(|(a, b)| {
-                let d = a as f64 - b as f64;
-                d * d
-            })
+        self.iter().zip(other.iter())
+            .map(|(a, b)| { let d = a as f64 - b as f64; d * d })
            .sum()
    }

-    /// Returns the relative frequency euclidean distance between two compact int vectors.
-    ///
-    /// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts.
    pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_a = self.sum() as f64;
-        let sum_b = other.sum() as f64;
-        if sum_a == 0.0 && sum_b == 0.0 {
-            return 0.0;
-        }
-        self.partial_relfreq_euclidean_dist(other, sum_a, sum_b)
-            .sqrt()
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; }
+        self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt()
    }

-    /// Returns the partial relative frequency euclidean distance between two compact int vectors.
-    ///
-    /// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency
-    /// euclidean distance over a set of vector pairs.
-    pub fn partial_relfreq_euclidean_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        sum_a: f64,
-        sum_b: f64,
-    ) -> f64 {
+    pub fn partial_relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
        assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
            .map(|(a, b)| {
                let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 };
                let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 };
@@ -262,46 +174,19 @@ impl PersistentCompactIntVec {
            .sum()
    }

-    /// Returns the Euclidean distance between two compact int vectors using the Hellinger transform.
-    ///
-    /// The Hellinger transform is applied to the raw counts of each vector, and the result is
-    /// the Euclidean distance between the transformed vectors. The Hellinger transform is defined
-    /// as the square root of the relative frequencies.
    pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 {
-        assert_eq!(self.n, other.len(), "length mismatch");
-        let sum_a = self.sum() as f64;
-        let sum_b = other.sum() as f64;
-        if sum_a == 0.0 && sum_b == 0.0 {
-            return 0.0;
-        }
-        self.partial_hellinger_euclidean_dist(other, sum_a, sum_b)
-            .sqrt()
+        let sa = self.sum() as f64;
+        let sb = other.sum() as f64;
+        if sa == 0.0 && sb == 0.0 { return 0.0; }
+        self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt()
    }

-    /// Returns the partial Hellinger Euclidean distance between two compact int vectors.
-    ///
-    /// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger
-    /// Euclidean distance over a set of vector pairs.
-    pub fn partial_hellinger_euclidean_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        sum_a: f64,
-        sum_b: f64,
-    ) -> f64 {
+    pub fn partial_hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 {
        assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
            .map(|(a, b)| {
-                let pa = if sum_a > 0.0 {
-                    (a as f64 / sum_a).sqrt()
-                } else {
-                    0.0
-                };
-                let pb = if sum_b > 0.0 {
-                    (b as f64 / sum_b).sqrt()
-                } else {
-                    0.0
-                };
+                let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 };
+                let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 };
                let d = pa - pb;
                d * d
            })
@@ -313,22 +198,13 @@ impl PersistentCompactIntVec {
    }

    pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 {
-        assert_eq!(self.n, other.len(), "length mismatch");
        let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold);
-        if union == 0 {
-            return 0.0;
-        }
-        1.0 - intersection as f64 / union as f64
+        if union == 0 { 0.0 } else { 1.0 - intersection as f64 / union as f64 }
    }

-    pub fn partial_threshold_jaccard_dist(
-        &self,
-        other: &PersistentCompactIntVec,
-        threshold: u32,
-    ) -> (u64, u64) {
+    pub fn partial_threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> (u64, u64) {
        assert_eq!(self.n, other.len(), "length mismatch");
-        self.iter()
-            .zip(other.iter())
+        self.iter().zip(other.iter())
            .fold((0u64, 0u64), |(inter, uni), (a, b)| {
                let ap = a >= threshold;
                let bp = b >= threshold;
@@ -339,23 +215,12 @@ impl PersistentCompactIntVec {
    pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 {
        self.threshold_jaccard_dist(other, 1)
    }
-
-    pub fn iter(&self) -> Iter<'_> {
-        Iter {
-            pciv: self,
-            slot: 0,
-            overflow_pos: 0,
-        }
-    }
 }

 impl<'a> IntoIterator for &'a PersistentCompactIntVec {
    type Item = u32;
    type IntoIter = Iter<'a>;
-
-    fn into_iter(self) -> Iter<'a> {
-        self.iter()
-    }
+    fn into_iter(self) -> Iter<'a> { self.iter() }
 }

 pub struct Iter<'a> {
@@ -370,9 +235,7 @@ impl Iterator for Iter<'_> {
    type Item = u32;

    fn next(&mut self) -> Option<u32> {
-        if self.slot >= self.pciv.n {
-            return None;
-        }
+        if self.slot >= self.pciv.n { return None; }
        let v = self.pciv.mmap[self.pciv.primary_offset + self.slot];
        self.slot += 1;
        if v < 255 {
@@ -0,0 +1,111 @@
+use std::io;
+use std::path::Path;
+
+use tempfile::TempDir;
+
+use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder};
+use crate::views::{BitSliceIter, BitSliceView, IntSliceView};
+
+// ── TempBitVec — frozen read-only, auto-deleted on drop ──────────────────────
+
+/// Frozen, read-only bit vector whose backing file lives in a temporary
+/// directory owned by this value.
+pub struct TempBitVec {
+    vec: PersistentBitVec,
+    // Keep `_temp` last: fields drop in declaration order, so the mmap held
+    // by `vec` is released before the temporary directory is removed.
+    _temp: TempDir,
+}
+
+impl TempBitVec {
+    /// Copies the backing file to `path`, then opens that copy as a
+    /// standalone `PersistentBitVec`.
+    pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentBitVec> {
+        std::fs::copy(self.vec.path(), path).and_then(|_| PersistentBitVec::open(path))
+    }
+
+    /// Length reported by the wrapped vector.
+    pub fn len(&self) -> usize { self.vec.len() }
+
+    /// Whether the wrapped vector reports itself as empty.
+    pub fn is_empty(&self) -> bool { self.vec.is_empty() }
+
+    /// Bit stored at `slot`, read through the wrapped vector.
+    pub fn get(&self, slot: usize) -> bool { self.vec.get(slot) }
+
+    /// Delegates to the wrapped vector's `count_ones`.
+    pub fn count_ones(&self) -> u64 { self.vec.count_ones() }
+
+    /// Borrowed view of the wrapped vector.
+    pub fn view(&self) -> BitSliceView<'_> { self.vec.view() }
+
+    /// Iterator obtained from the borrowed view.
+    pub fn iter(&self) -> BitSliceIter<'_> { self.vec.view().iter() }
+}
+
+// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
+
+/// Mutable bit-vector builder writing to `data.pbiv` inside its own temporary
+/// directory; `freeze` turns it into a `TempBitVec` that keeps the directory.
+pub struct TempBitVecBuilder {
+    builder: PersistentBitVecBuilder,
+    temp: TempDir,
+}
+
+impl TempBitVecBuilder {
+    /// Creates a temporary directory and a `PersistentBitVecBuilder::new`
+    /// builder of `n` slots inside it.
+    pub fn new(n: usize) -> io::Result<Self> {
+        let temp = TempDir::new()?;
+        let builder = PersistentBitVecBuilder::new(n, &temp.path().join("data.pbiv"))?;
+        Ok(Self { builder, temp })
+    }
+
+    /// Same as [`new`](Self::new) but built with
+    /// `PersistentBitVecBuilder::new_ones` (presumably every bit starts set).
+    pub fn new_ones(n: usize) -> io::Result<Self> {
+        let temp = TempDir::new()?;
+        let builder = PersistentBitVecBuilder::new_ones(n, &temp.path().join("data.pbiv"))?;
+        Ok(Self { builder, temp })
+    }
+
+    /// Finishes the wrapped builder and hands the temporary directory over to
+    /// the resulting read-only `TempBitVec`.
+    pub fn freeze(self) -> io::Result<TempBitVec> {
+        let vec = self.builder.finish()?;
+        Ok(TempBitVec { vec, _temp: self.temp })
+    }
+
+    /// Delegates to the wrapped builder's `set`.
+    pub fn set(&mut self, slot: usize, value: bool) { self.builder.set(slot, value); }
+
+    /// Borrowed view of the bits built so far.
+    pub fn view(&self) -> BitSliceView<'_> { self.builder.view() }
+
+    /// Delegates to the wrapped builder's `or`.
+    pub fn or(&mut self, other: BitSliceView<'_>) { self.builder.or(other); }
+
+    /// Delegates to the wrapped builder's `and`.
+    pub fn and(&mut self, other: BitSliceView<'_>) { self.builder.and(other); }
+
+    /// Delegates to the wrapped builder's `xor`.
+    pub fn xor(&mut self, other: BitSliceView<'_>) { self.builder.xor(other); }
+
+    /// Delegates to the wrapped builder's `not`.
+    pub fn not(&mut self) { self.builder.not(); }
+
+    /// Delegates to the wrapped builder's `copy_from`.
+    pub fn copy_from(&mut self, src: BitSliceView<'_>) { self.builder.copy_from(src); }
+
+    /// Delegates to the wrapped builder's `or_where` with the integer column
+    /// `col` and predicate `pred`.
+    pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.or_where(col, pred);
+    }
+
+    /// Delegates to the wrapped builder's `and_where` with the integer column
+    /// `col` and predicate `pred`.
+    pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.and_where(col, pred);
+    }
+
+    /// Delegates to the wrapped builder's `xor_where` with the integer column
+    /// `col` and predicate `pred`.
+    pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.xor_where(col, pred);
+    }
+}
@@ -0,0 +1,89 @@
+use std::io;
+use std::path::Path;
+
+use tempfile::TempDir;
+
+use crate::builder::PersistentCompactIntVecBuilder;
+use crate::reader::PersistentCompactIntVec;
+use crate::views::{BitSliceView, IntSliceView};
+
+// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ────────────────
+
+/// Frozen, read-only compact integer vector whose backing file lives in a
+/// temporary directory owned by this value.
+pub struct TempCompactIntVec {
+    vec: PersistentCompactIntVec,
+    // Keep `_temp` last: fields drop in declaration order, so the mmap held
+    // by `vec` is released before the temporary directory is removed.
+    _temp: TempDir,
+}
+
+impl TempCompactIntVec {
+    /// Copies the backing file to `path`, then opens that copy as a
+    /// standalone `PersistentCompactIntVec`.
+    pub fn make_persistent(&self, path: &Path) -> io::Result<PersistentCompactIntVec> {
+        std::fs::copy(self.vec.path(), path).and_then(|_| PersistentCompactIntVec::open(path))
+    }
+
+    /// Length reported by the wrapped vector.
+    pub fn len(&self) -> usize {
+        self.vec.len()
+    }
+
+    /// Whether the wrapped vector reports itself as empty.
+    pub fn is_empty(&self) -> bool {
+        self.vec.is_empty()
+    }
+
+    /// Value stored at `slot`, read through the wrapped vector.
+    pub fn get(&self, slot: usize) -> u32 {
+        self.vec.get(slot)
+    }
+
+    /// Delegates to the wrapped vector's `sum`.
+    pub fn sum(&self) -> u64 {
+        self.vec.sum()
+    }
+
+    /// Borrowed view of the wrapped vector.
+    pub fn view(&self) -> IntSliceView<'_> {
+        self.vec.view()
+    }
+
+    /// Iterator over the wrapped vector.
+    pub fn iter(&self) -> crate::reader::Iter<'_> {
+        self.vec.iter()
+    }
+}
+
+// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
+
+/// Mutable compact-integer builder writing to `data.pciv` inside its own
+/// temporary directory; `freeze` turns it into a `TempCompactIntVec` that
+/// keeps the directory.
+pub struct TempCompactIntVecBuilder {
+    builder: PersistentCompactIntVecBuilder,
+    temp: TempDir,
+}
+
+impl TempCompactIntVecBuilder {
+    /// Creates a temporary directory and a `PersistentCompactIntVecBuilder`
+    /// of `n` slots inside it.
+    pub fn new(n: usize) -> io::Result<Self> {
+        let temp = TempDir::new()?;
+        let builder = PersistentCompactIntVecBuilder::new(n, &temp.path().join("data.pciv"))?;
+        Ok(Self { builder, temp })
+    }
+
+    /// Finishes the wrapped builder and hands the temporary directory over to
+    /// the resulting read-only `TempCompactIntVec`.
+    pub fn freeze(self) -> io::Result<TempCompactIntVec> {
+        let vec = self.builder.finish()?;
+        Ok(TempCompactIntVec { vec, _temp: self.temp })
+    }
+
+    /// Number of slots, as reported by the wrapped builder's `len`.
+    pub fn n(&self) -> usize {
+        self.builder.len()
+    }
+
+    /// Delegates to the wrapped builder's `set`.
+    pub fn set(&mut self, slot: usize, value: u32) {
+        self.builder.set(slot, value);
+    }
+
+    /// Delegates to the wrapped builder's `get`.
+    pub fn get(&self, slot: usize) -> u32 {
+        self.builder.get(slot)
+    }
+
+    /// Shared access to the wrapped builder's primary bytes.
+    pub fn primary_bytes(&self) -> &[u8] {
+        self.builder.primary_bytes()
+    }
+
+    /// Mutable access to the wrapped builder's primary bytes.
+    pub fn primary_bytes_mut(&mut self) -> &mut [u8] {
+        self.builder.primary_bytes_mut()
+    }
+
+    /// Delegates to the wrapped builder's `inc_present` with the bit column `col`.
+    pub fn inc_present(&mut self, col: BitSliceView<'_>) {
+        self.builder.inc_present(col);
+    }
+
+    /// Delegates to the wrapped builder's `inc_present_fast` with the bit column `col`.
+    pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
+        self.builder.inc_present_fast(col);
+    }
+
+    /// Delegates to the wrapped builder's `inc_predicate` with the integer
+    /// column `col` and predicate `pred`.
+    pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.inc_predicate(col, pred);
+    }
+
+    /// Delegates to the wrapped builder's `inc_predicate_fast` with the
+    /// integer column `col` and predicate `pred`.
+    pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
+        self.builder.inc_predicate_fast(col, pred);
+    }
+
+    /// Delegates to the wrapped builder's `add`.
+    pub fn add(&mut self, other: IntSliceView<'_>) {
+        self.builder.add(other);
+    }
+
+    /// Delegates to the wrapped builder's `mask_with`.
+    pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
+        self.builder.mask_with(mask);
+    }
+
+    /// Delegates to the wrapped builder's `min`.
+    pub fn min(&mut self, other: IntSliceView<'_>) {
+        self.builder.min(other);
+    }
+
+    /// Delegates to the wrapped builder's `max`.
+    pub fn max(&mut self, other: IntSliceView<'_>) {
+        self.builder.max(other);
+    }
+
+    /// Delegates to the wrapped builder's `diff`.
+    pub fn diff(&mut self, other: IntSliceView<'_>) {
+        self.builder.diff(other);
+    }
+}
@@ -1,6 +1,6 @@
 use tempfile::tempdir;

-use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder};
+use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder};
 use crate::traits::BitPartials;

 fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
@@ -203,3 +203,57 @@ fn partial_hamming_matches_hamming() {
    let full    = m.hamming_dist_matrix();
    assert_eq!(partial, full);
 }
+
+// ── col_view on Packed ────────────────────────────────────────────────────────
+
+/// Builds a two-column matrix, runs `pack_bit_matrix` on its `presence`
+/// directory, reopens it and checks `col_view` against the known bits.
+#[test]
+fn col_view_packed_values() {
+    // Expected contents, one row per column.
+    let expected: [[bool; 4]; 2] = [
+        [true, false, true, true],
+        [false, true, false, true],
+    ];
+    let (dir, _) = make_matrix(&[&expected[0][..], &expected[1][..]]);
+    pack_bit_matrix(&dir.path().join("presence")).unwrap();
+    let m = PersistentBitMatrix::open(dir.path()).unwrap();
+
+    // col 0: length, every bit, then the population count.
+    let v0 = m.col_view(0);
+    assert_eq!(v0.len(), 4);
+    for (slot, &bit) in expected[0].iter().enumerate() {
+        assert_eq!(v0.get(slot), bit);
+    }
+    assert_eq!(v0.count_ones(), 3);
+
+    // col 1: every bit, then the population count (length is not re-checked).
+    let v1 = m.col_view(1);
+    for (slot, &bit) in expected[1].iter().enumerate() {
+        assert_eq!(v1.get(slot), bit);
+    }
+    assert_eq!(v1.count_ones(), 2);
+}
+
+/// Builds the same data twice, packs one copy, and checks that `col_view` on
+/// the packed copy agrees with `col` on the unpacked one for every column:
+/// length, each bit, `count_ones` and `words`.
+#[test]
+fn col_view_packed_matches_columnar() {
+    let data: &[&[bool]] = &[
+        &[true, false, true, false, true, true, false, true],
+        &[false, false, true, true, false, true, true, false],
+        &[true, true, true, false, false, false, true, true],
+    ];
+    let (dir_col, m_col) = make_matrix(data);
+    let (dir_pack, _) = make_matrix(data);
+    let packed_root = dir_pack.path();
+    pack_bit_matrix(&packed_root.join("presence")).unwrap();
+    let m_pack = PersistentBitMatrix::open(packed_root).unwrap();
+
+    for (c, _) in data.iter().enumerate() {
+        let reference = m_col.col(c);
+        let packed = m_pack.col_view(c);
+        let n_slots = reference.len();
+
+        assert_eq!(packed.len(), n_slots, "col={c} len");
+        for s in 0..n_slots {
+            assert_eq!(packed.get(s), reference.get(s), "col={c} slot={s}");
+        }
+        assert_eq!(packed.count_ones(), reference.count_ones(), "col={c} count_ones");
+        assert_eq!(packed.words(), reference.words(), "col={c} words");
+    }
+    // The unpacked matrix's temp dir is released explicitly at the very end.
+    drop(dir_col);
+}
@@ -77,7 +77,7 @@ fn op_and() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("out.pbiv");
    let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
-    b.and(&rb);
+    b.and(rb.view());
    b.close().unwrap();
    let r = PersistentBitVec::open(&path).unwrap();
    assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, false, false, false]);
@@ -90,7 +90,7 @@ fn op_or() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("out.pbiv");
    let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
-    b.or(&rb);
+    b.or(rb.view());
    b.close().unwrap();
    let r = PersistentBitVec::open(&path).unwrap();
    assert_eq!(r.iter().collect::<Vec<_>>(), vec![true, true, true, false]);
@@ -103,7 +103,7 @@ fn op_xor() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("out.pbiv");
    let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap();
-    b.xor(&rb);
+    b.xor(rb.view());
    b.close().unwrap();
    let r = PersistentBitVec::open(&path).unwrap();
    assert_eq!(r.iter().collect::<Vec<_>>(), vec![false, true, true, false]);
@@ -0,0 +1,223 @@
+use tempfile::tempdir;
+
+use crate::{
+    ColGroup, MatrixGroupOps,
+    PersistentBitMatrix, PersistentBitMatrixBuilder,
+    PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder,
+};
+use crate::{PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder};
+
+// ── helpers ───────────────────────────────────────────────────────────────────
+
+/// Writes `cols` as an integer matrix under `<tempdir>/counts` and reopens it
+/// with `PersistentCompactIntMatrix::open` on the temporary directory.
+///
+/// The slot count is the length of the first column (0 when `cols` is empty).
+/// The `TempDir` guard is returned next to the matrix so the caller decides
+/// how long the directory stays on disk.
+fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
+    let dir = tempdir().unwrap();
+    let n_slots = match cols.first() {
+        Some(first) => first.len(),
+        None => 0,
+    };
+    let counts_path = dir.path().join("counts");
+    let mut builder = PersistentCompactIntMatrixBuilder::new(n_slots, &counts_path).unwrap();
+    for column in cols.iter() {
+        // One column builder per input column, closed before the next one starts.
+        let mut col_builder = builder.add_col().unwrap();
+        for (slot, &value) in column.iter().enumerate() {
+            col_builder.set(slot, value);
+        }
+        col_builder.close().unwrap();
+    }
+    builder.close().unwrap();
+    let matrix = PersistentCompactIntMatrix::open(dir.path()).unwrap();
+    (dir, matrix)
+}
+
+/// Writes `cols` as a bit matrix under `<tempdir>/presence` and reopens it
+/// with `PersistentBitMatrix::open` on the temporary directory.
+///
+/// The slot count is the length of the first column (0 when `cols` is empty).
+/// The `TempDir` guard is returned next to the matrix so the caller decides
+/// how long the directory stays on disk.
+fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) {
+    let dir = tempdir().unwrap();
+    let n_slots = match cols.first() {
+        Some(first) => first.len(),
+        None => 0,
+    };
+    let presence_path = dir.path().join("presence");
+    let mut builder = PersistentBitMatrixBuilder::new(n_slots, &presence_path).unwrap();
+    for column in cols.iter() {
+        // One column builder per input column, closed before the next one starts.
+        let mut col_builder = builder.add_col().unwrap();
+        for (slot, &bit) in column.iter().enumerate() {
+            col_builder.set(slot, bit);
+        }
+        col_builder.close().unwrap();
+    }
+    builder.close().unwrap();
+    let matrix = PersistentBitMatrix::open(dir.path()).unwrap();
+    (dir, matrix)
+}
+
+// ── IntMatrix: partial_group_sum ──────────────────────────────────────────────
+
+#[test]
+fn int_partial_group_sum_basic() {
+    // Columns: c0 = [1, 2, 3], c1 = [10, 20, 30], c2 = [100, 0, 5].
+    // Summing group {0, 2} slot by slot gives [101, 2, 8]; column 1 is not
+    // in the group and must not contribute.
+    let columns: &[&[u32]] = &[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]];
+    let (_guard, matrix) = make_int_matrix(columns);
+    let group = ColGroup::new("g", vec![0, 2]);
+
+    let sums = matrix.partial_group_sum(&group).unwrap();
+
+    assert_eq!(sums.get(0), 101);
+    assert_eq!(sums.get(1), 2);
+    assert_eq!(sums.get(2), 8);
+}
+
+#[test]
+fn int_partial_group_sum_with_overflow() {
+    // Columns: c0 = [300, 0], c1 = [200, 400]; every non-zero input is above 254.
+    // Group {0, 1} sums to [500, 400], for a grand total of 900.
+    let columns: &[&[u32]] = &[&[300, 0], &[200, 400]];
+    let (_guard, matrix) = make_int_matrix(columns);
+    let group = ColGroup::new("g", vec![0, 1]);
+
+    let sums = matrix.partial_group_sum(&group).unwrap();
+
+    assert_eq!(sums.get(0), 500);
+    assert_eq!(sums.get(1), 400);
+    assert_eq!(sums.sum(), 900);
+}
+
+// ── IntMatrix: partial_group_presence_count ───────────────────────────────────
+
+#[test]
+fn int_partial_group_presence_count() {
+    // Columns: c0 = [5, 1, 0, 3], c1 = [2, 0, 4, 3], c2 = [0, 3, 1, 0].
+    // With threshold 2 a slot is "present" when its count is >= 2:
+    //   c0 -> [T, F, F, T], c1 -> [T, F, T, T], c2 -> [F, T, F, F]
+    // so group {0, 1, 2} has [2, 1, 1, 2] present columns per slot.
+    let columns: &[&[u32]] = &[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]];
+    let (_guard, matrix) = make_int_matrix(columns);
+    let group = ColGroup::new("g", vec![0, 1, 2]);
+
+    let present = matrix.partial_group_presence_count(&group, 2).unwrap();
+
+    assert_eq!(present.get(0), 2);
+    assert_eq!(present.get(1), 1);
+    assert_eq!(present.get(2), 1);
+    assert_eq!(present.get(3), 2);
+}
+
+#[test]
+fn int_partial_group_presence_count_with_overflow() {
+    // Columns: c0 = [300, 0, 10], c1 = [0, 400, 10], c2 = [1, 1, 10].
+    // With threshold 5:
+    //   c0 -> [T, F, T], c1 -> [F, T, T], c2 -> [F, F, T]
+    // so group {0, 1, 2} has [1, 1, 3] present columns per slot.
+    let columns: &[&[u32]] = &[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]];
+    let (_guard, matrix) = make_int_matrix(columns);
+    let group = ColGroup::new("g", vec![0, 1, 2]);
+
+    let present = matrix.partial_group_presence_count(&group, 5).unwrap();
+
+    assert_eq!(present.get(0), 1);
+    assert_eq!(present.get(1), 1);
+    assert_eq!(present.get(2), 3);
+}
+
+// ── IntMatrix: partial_group_any ──────────────────────────────────────────────
+
+#[test]
+fn int_partial_group_any() {
+    // Columns: c0 = [0, 3, 0, 1], c1 = [2, 0, 0, 0], c2 = [0, 0, 5, 0].
+    // With threshold 2:
+    //   c0 -> [F, T, F, F], c1 -> [T, F, F, F], c2 -> [F, F, T, F]
+    // so for group {0, 1, 2} "any column present" is [T, T, T, F].
+    let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]);
+    let g = ColGroup::new("g", vec![0, 1, 2]);
+    let result = m.partial_group_any(&g, 2).unwrap();
+    // Boolean results are asserted directly rather than compared to literals.
+    assert!(result.get(0));
+    assert!(result.get(1));
+    assert!(result.get(2));
+    assert!(!result.get(3));
+}
+
+// ── IntMatrix: mask_with ──────────────────────────────────────────────────────
+
+#[test]
+fn mask_with_zeros_selected_slots() {
+    // Counts [10, 20, 30, 40] masked by [T, F, T, F] become [10, 0, 30, 0]:
+    // a slot keeps its value where the mask bit is set and is zeroed elsewhere.
+    let dir = tempdir().unwrap();
+    let counts_path = dir.path().join("v.pciv");
+    let mask_path = dir.path().join("m.pbiv");
+
+    let mut counts = PersistentCompactIntVecBuilder::new(4, &counts_path).unwrap();
+    counts.set(0, 10);
+    counts.set(1, 20);
+    counts.set(2, 30);
+    counts.set(3, 40);
+
+    let mut mask = PersistentBitVecBuilder::new(4, &mask_path).unwrap();
+    mask.set(0, true);
+    mask.set(2, true);
+
+    counts.mask_with(mask.view());
+    counts.close().unwrap();
+
+    // Check the result as read back from disk, not the in-memory builder.
+    let reopened = PersistentCompactIntVec::open(&counts_path).unwrap();
+    assert_eq!(reopened.get(0), 10);
+    assert_eq!(reopened.get(1), 0);
+    assert_eq!(reopened.get(2), 30);
+    assert_eq!(reopened.get(3), 0);
+}
+
+#[test]
+fn mask_with_overflow_slot_zeroed() {
+    // Slot 1 holds 500 and is the only slot whose mask bit stays clear.
+    // After masking it must read 0 and leave no overflow entry behind.
+    let dir = tempdir().unwrap();
+    let counts_path = dir.path().join("v.pciv");
+    let mask_path = dir.path().join("m.pbiv");
+
+    let mut counts = PersistentCompactIntVecBuilder::new(3, &counts_path).unwrap();
+    counts.set(0, 10);
+    counts.set(1, 500);
+    counts.set(2, 5);
+
+    // Only slots 0 and 2 are kept.
+    let mut mask = PersistentBitVecBuilder::new(3, &mask_path).unwrap();
+    mask.set(0, true);
+    mask.set(2, true);
+
+    counts.mask_with(mask.view());
+    counts.close().unwrap();
+
+    let reopened = PersistentCompactIntVec::open(&counts_path).unwrap();
+    assert_eq!(reopened.get(0), 10);
+    assert_eq!(reopened.get(1), 0);
+    assert_eq!(reopened.get(2), 5);
+    let no_overflow_left = reopened.view().overflow_entries().next().is_none();
+    assert!(no_overflow_left, "overflow entry for masked-out slot should be gone");
+}
+
+#[test]
+fn mask_with_all_ones_is_noop() {
+    // A mask created with `new_ones` must leave every slot as it was,
+    // including the value above 254 and the explicit zero.
+    let dir = tempdir().unwrap();
+    let counts_path = dir.path().join("v.pciv");
+    let mask_path = dir.path().join("m.pbiv");
+
+    let mut counts = PersistentCompactIntVecBuilder::new(4, &counts_path).unwrap();
+    counts.set(0, 300);
+    counts.set(1, 1);
+    counts.set(2, 0);
+    counts.set(3, 42);
+
+    let all_ones = PersistentBitVecBuilder::new_ones(4, &mask_path).unwrap();
+    counts.mask_with(all_ones.view());
+    counts.close().unwrap();
+
+    let reopened = PersistentCompactIntVec::open(&counts_path).unwrap();
+    assert_eq!(reopened.get(0), 300);
+    assert_eq!(reopened.get(1), 1);
+    assert_eq!(reopened.get(2), 0);
+    assert_eq!(reopened.get(3), 42);
+}
+
+// ── BitMatrix: partial_group_presence_count ───────────────────────────────────
+
+#[test]
+fn bit_partial_group_presence_count() {
+    // Columns: c0 = [T, F, T, F], c1 = [T, T, F, F], c2 = [F, T, T, F].
+    // Group {0, 1, 2} has [2, 2, 2, 0] set bits per slot.
+    let columns: &[&[bool]] = &[
+        &[true, false, true, false],
+        &[true, true, false, false],
+        &[false, true, true, false],
+    ];
+    let (_guard, matrix) = make_bit_matrix(columns);
+    let group = ColGroup::new("g", vec![0, 1, 2]);
+
+    let present = matrix.partial_group_presence_count(&group, 1).unwrap();
+
+    assert_eq!(present.get(0), 2);
+    assert_eq!(present.get(1), 2);
+    assert_eq!(present.get(2), 2);
+    assert_eq!(present.get(3), 0);
+}
+
+// ── BitMatrix: partial_group_any ──────────────────────────────────────────────
+
+#[test]
+fn bit_partial_group_any() {
+    // Columns: c0 = [T, F, F], c1 = [F, F, T].
+    // For group {0, 1} "any column set" is [T, F, T].
+    let (_d, m) = make_bit_matrix(&[
+        &[true, false, false],
+        &[false, false, true],
+    ]);
+    let g = ColGroup::new("g", vec![0, 1]);
+    let result = m.partial_group_any(&g, 1).unwrap();
+    // Boolean results are asserted directly rather than compared to literals.
+    assert!(result.get(0));
+    assert!(!result.get(1));
+    assert!(result.get(2));
+}
+
+// ── Composition: partial results are additive ─────────────────────────────────
+
+#[test]
+fn int_presence_count_additive_across_split() {
+    // Two partitions stand in for disjoint kmer ranges of one global matrix:
+    //   global c0 = [5, 1, 0, 3, 2], global c1 = [2, 0, 4, 3, 1], threshold 2
+    //   partition A = slots 0..2, partition B = slots 2..5
+    // Each partition is evaluated on its own; this test checks the
+    // per-partition counts that a caller would then combine.
+    let slots_a: &[&[u32]] = &[&[5, 1], &[2, 0]];
+    let slots_b: &[&[u32]] = &[&[0, 3, 2], &[4, 3, 1]];
+    let (_guard_a, matrix_a) = make_int_matrix(slots_a);
+    let (_guard_b, matrix_b) = make_int_matrix(slots_b);
+    let group = ColGroup::new("g", vec![0, 1]);
+
+    let present_a = matrix_a.partial_group_presence_count(&group, 2).unwrap();
+    let present_b = matrix_b.partial_group_presence_count(&group, 2).unwrap();
+
+    // Partition A: c0 = [5≥2, 1<2] = [T, F], c1 = [2≥2, 0<2] = [T, F] → [2, 0]
+    assert_eq!(present_a.get(0), 2);
+    assert_eq!(present_a.get(1), 0);
+
+    // Partition B: c0 = [0<2, 3≥2, 2≥2] = [F, T, T],
+    //              c1 = [4≥2, 3≥2, 1<2] = [T, T, F] → [1, 2, 1]
+    assert_eq!(present_b.get(0), 1);
+    assert_eq!(present_b.get(1), 2);
+    assert_eq!(present_b.get(2), 1);
+}
@@ -1,6 +1,6 @@
 use tempfile::tempdir;

-use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
+use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder};
 use crate::traits::CountPartials;

 fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) {
@@ -243,6 +243,61 @@ fn partial_hellinger_matches_full() {
    }
 }

+#[test]
+fn col_view_packed_values() {
+    // Build a matrix containing values of 255 and above, pack it in place,
+    // reopen it and read every column back through col_view().
+    let (dir, _unpacked) = make_matrix(&[&[10, 300, 500], &[200, 50, 1000]]);
+    let counts_dir = dir.path().join("counts");
+    pack_compact_int_matrix(&counts_dir).unwrap();
+    let packed = PersistentCompactIntMatrix::open(dir.path()).unwrap();
+
+    // Column 0 = [10, 300, 500]: slots 1 and 2 are overflow slots.
+    let col0 = packed.col_view(0);
+    assert_eq!(col0.get(0), 10);
+    assert_eq!(col0.get(1), 300);
+    assert_eq!(col0.get(2), 500);
+    assert_eq!(col0.sum(), 810);
+    assert_eq!(col0.count_nonzero(), 3);
+    let mut overflow0: Vec<(usize, u32)> = col0.overflow_entries().collect();
+    overflow0.sort_unstable_by_key(|entry| entry.0);
+    assert_eq!(overflow0, vec![(1, 300), (2, 500)]);
+
+    // Column 1 = [200, 50, 1000]: only slot 2 is an overflow slot.
+    let col1 = packed.col_view(1);
+    assert_eq!(col1.get(0), 200);
+    assert_eq!(col1.get(1), 50);
+    assert_eq!(col1.get(2), 1000);
+    let mut overflow1: Vec<(usize, u32)> = col1.overflow_entries().collect();
+    overflow1.sort_unstable_by_key(|entry| entry.0);
+    assert_eq!(overflow1, vec![(2, 1000)]);
+}
+
+#[test]
+fn col_view_packed_matches_columnar() {
+    // Identical data twice: one matrix stays as built, the other is packed.
+    // col_view() on the packed one must agree with col() on the other, slot by slot.
+    let data: &[&[u32]] = &[&[0, 255, 1, 300, 128], &[500, 3, 0, 700, 42]];
+    let (reference_dir, reference) = make_matrix(data);
+
+    // The packed copy gets its own directory so packing never touches the
+    // files behind `reference`.
+    let (packed_dir, _) = make_matrix(data);
+    let counts_dir = packed_dir.path().join("counts");
+    pack_compact_int_matrix(&counts_dir).unwrap();
+    let packed = PersistentCompactIntMatrix::open(packed_dir.path()).unwrap();
+
+    for c in 0..data.len() {
+        let expected = reference.col(c);
+        let actual = packed.col_view(c);
+        assert_eq!(actual.len(), expected.len());
+        for s in 0..expected.len() {
+            assert_eq!(actual.get(s), expected.get(s), "col={c} slot={s}");
+        }
+        assert_eq!(actual.sum(), expected.sum(), "col={c} sum");
+
+        // Overflow entries are compared after sorting both sides by slot.
+        let mut actual_overflow: Vec<(usize, u32)> = actual.overflow_entries().collect();
+        let mut expected_overflow: Vec<(usize, u32)> =
+            expected.view().overflow_entries().collect();
+        actual_overflow.sort_unstable_by_key(|entry| entry.0);
+        expected_overflow.sort_unstable_by_key(|entry| entry.0);
+        assert_eq!(actual_overflow, expected_overflow, "col={c} overflow_entries");
+    }
+    // Release the reference directory only after all comparisons have run.
+    drop(reference_dir);
+}
+
 #[test]
 fn partial_relfreq_bray_additive_across_split() {
    // Split rows [1,2,3,4,5] between two matrices; partial sums should add up.
@@ -1,5 +1,6 @@
 mod bitmatrix;
 mod bitvec;
+mod colgroup;
 mod intmatrix;

 use tempfile::tempdir;
@@ -169,7 +170,7 @@ fn combine_min() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("out.pciv");
    let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.min(&rb);
+    b.min(rb.view());
    b.close().unwrap();
    let r = PersistentCompactIntVec::open(&path).unwrap();
    assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 100, 0, 800]);
@@ -182,7 +183,7 @@ fn combine_max() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("out.pciv");
    let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.max(&rb);
+    b.max(rb.view());
    b.close().unwrap();
    let r = PersistentCompactIntVec::open(&path).unwrap();
    assert_eq!(r.iter().collect::<Vec<_>>(), vec![20, 300, 500, 1000]);
@@ -195,7 +196,7 @@ fn combine_add() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("out.pciv");
    let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.add(&rb);
+    b.add(rb.view());
    b.close().unwrap();
    let r = PersistentCompactIntVec::open(&path).unwrap();
    assert_eq!(r.iter().collect::<Vec<_>>(), vec![30, 300, 5, 101]);
@@ -220,7 +221,7 @@ fn combine_diff() {
    let dir = tempdir().unwrap();
    let path = dir.path().join("out.pciv");
    let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap();
-    b.diff(&rb);
+    b.diff(rb.view());
    b.close().unwrap();
    let r = PersistentCompactIntVec::open(&path).unwrap();
    assert_eq!(r.iter().collect::<Vec<_>>(), vec![10, 700, 0, 0]);
@@ -1,9 +1,17 @@
 use ndarray::{Array1, Array2};

-/// Column-level weight statistic — total count or presence count per column.
+/// Column-level weight statistic — total count or presence count per column.
 /// Additive across layers and partitions; used as denominator in normalised distances.
+///
+/// `partial_kmer_counts` returns the number of **distinct k-mers** present per
+/// column (presence = 1 entries; count > 0 entries). For presence matrices this
+/// equals `col_weights`; for count matrices it differs (count_nonzero vs sum).
 pub trait ColumnWeights: Send + Sync {
+    /// Per-column weight, one `u64` per column.
     fn col_weights(&self) -> Array1<u64>;
+
+    /// Per-column number of distinct k-mers.
+    ///
+    /// The default simply forwards to `col_weights()`, which is only the
+    /// right answer when the weights already are presence counts; per the
+    /// trait documentation, count matrices need their own implementation.
+    fn partial_kmer_counts(&self) -> Array1<u64> {
+        self.col_weights()
+    }
 }

 /// Partial distance matrices for count-based data (`PersistentCompactIntMatrix`).
@@ -0,0 +1,278 @@
+use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry};
+
+// ── BitSliceView ──────────────────────────────────────────────────────────────
+
+/// Lightweight, copy-able read-only view over a u64 word array.
+/// Bit `i` is in `words[i >> 6]` at position `i & 63`.  Padding bits are zero.
+///
+/// `count_ones` adds up the popcount of every word and `count_zeros`
+/// subtracts that from `n`, so both depend on the padding bits being zero.
+#[derive(Clone, Copy)]
+pub struct BitSliceView<'a> {
+    /// Backing words, 64 bits each, borrowed for `'a`.
+    pub(crate) words: &'a [u64],
+    /// Number of bits in the view (the value `len()` reports).
+    pub(crate) n:     usize,
+}
+
+impl<'a> BitSliceView<'a> {
+    /// Wraps `words` as a view of `n` bits; nothing is validated here.
+    #[inline]
+    pub fn new(words: &'a [u64], n: usize) -> Self {
+        Self { words, n }
+    }
+
+    /// Number of bits in the view.
+    pub fn len(&self) -> usize {
+        self.n
+    }
+
+    /// `true` when the view holds no bits.
+    pub fn is_empty(&self) -> bool {
+        self.n == 0
+    }
+
+    /// The backing word slice, borrowed for the full lifetime `'a`.
+    pub fn words(&self) -> &'a [u64] {
+        self.words
+    }
+
+    /// Value of bit `slot`.
+    ///
+    /// Only the word index is bounds-checked (by slice indexing); `slot` is
+    /// not compared against `len()`.
+    #[inline]
+    pub fn get(&self, slot: usize) -> bool {
+        let word = self.words[slot >> 6];
+        let mask = 1u64 << (slot & 63);
+        (word & mask) != 0
+    }
+
+    /// Number of set bits, taken over every backing word.
+    pub fn count_ones(&self) -> u64 {
+        let mut total = 0u64;
+        for word in self.words {
+            total += u64::from(word.count_ones());
+        }
+        total
+    }
+
+    /// Number of clear bits: `len()` minus `count_ones()`.
+    pub fn count_zeros(&self) -> u64 {
+        let ones = self.count_ones();
+        self.n as u64 - ones
+    }
+
+    /// Iterator over the `len()` bits, starting at slot 0.
+    pub fn iter(&self) -> BitSliceIter<'a> {
+        BitSliceIter {
+            n: self.n,
+            slot: 0,
+            words: self.words,
+        }
+    }
+
+    /// Returns `(intersection, union)` bit counts of the two views.
+    ///
+    /// Panics when the two views differ in length.
+    pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) {
+        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
+        let mut inter = 0u64;
+        let mut uni = 0u64;
+        for (&a, &b) in self.words.iter().zip(other.words) {
+            inter += u64::from((a & b).count_ones());
+            uni += u64::from((a | b).count_ones());
+        }
+        (inter, uni)
+    }
+
+    /// Jaccard distance `1 - intersection / union`; `0.0` when the union is empty.
+    pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 {
+        match self.partial_jaccard_dist(other) {
+            (_, 0) => 0.0,
+            (inter, union) => 1.0 - inter as f64 / union as f64,
+        }
+    }
+
+    /// Number of bit positions at which the two views differ.
+    ///
+    /// Panics when the two views differ in length.
+    pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 {
+        assert_eq!(self.n, other.n, "BitSliceView length mismatch");
+        let mut differing = 0u64;
+        for (&a, &b) in self.words.iter().zip(other.words) {
+            differing += u64::from((a ^ b).count_ones());
+        }
+        differing
+    }
+}
+
+// ── BitSliceIter ──────────────────────────────────────────────────────────────
+
+/// Iterator over the bits of a word slice: yields `n` booleans, starting at
+/// slot 0, reading bit `slot & 63` of `words[slot >> 6]`.
+pub struct BitSliceIter<'a> {
+    /// Backing words being read.
+    words: &'a [u64],
+    /// Next slot to yield.
+    slot:  usize,
+    /// Total number of slots; iteration stops once `slot` reaches it.
+    n:     usize,
+}
+
+impl Iterator for BitSliceIter<'_> {
+    type Item = bool;
+
+    /// Yields the bit at the current slot and advances, or `None` past the end.
+    fn next(&mut self) -> Option<bool> {
+        let slot = self.slot;
+        if slot >= self.n {
+            return None;
+        }
+        // Read first, advance second: the cursor only moves after a successful read.
+        let word = self.words[slot >> 6];
+        let bit = (word & (1u64 << (slot & 63))) != 0;
+        self.slot = slot + 1;
+        Some(bit)
+    }
+
+    /// Exact number of bits still to be yielded.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = self.n - self.slot;
+        (remaining, Some(remaining))
+    }
+}
+impl ExactSizeIterator for BitSliceIter<'_> {}
+
+// ── IntSliceView ──────────────────────────────────────────────────────────────
+
+/// Lightweight, copy-able read-only view over a compact-int primary array plus
+/// its sorted raw overflow bytes.  Zero-copy: all data lives in the caller's mmap.
+///
+/// A primary byte below 255 is the slot's value; a byte of 255 means the
+/// value is looked up in the overflow entries instead.
+#[derive(Clone, Copy)]
+pub struct IntSliceView<'a> {
+    /// One byte per slot, indexed by slot.
+    pub(crate) primary:      &'a [u8],
+    pub(crate) overflow_raw: &'a [u8],   // n_overflow × OVERFLOW_ENTRY_SIZE bytes, sorted by slot
+    /// Number of entries in `overflow_raw`.
+    pub(crate) n_overflow:   usize,
+    /// Number of slots (the value `len()` reports).
+    pub(crate) n:            usize,
+}
+
+impl<'a> IntSliceView<'a> {
+    /// Assembles a view from its four parts; nothing is validated here.
+    #[inline]
+    pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self {
+        Self {
+            primary,
+            overflow_raw,
+            n_overflow,
+            n,
+        }
+    }
+
+    /// Number of slots.
+    pub fn len(&self) -> usize {
+        self.n
+    }
+
+    /// `true` when the view has no slots.
+    pub fn is_empty(&self) -> bool {
+        self.n == 0
+    }
+
+    /// The raw primary bytes, borrowed for the full lifetime `'a`.
+    pub fn primary_bytes(&self) -> &'a [u8] {
+        self.primary
+    }
+
+    /// Number of overflow entries.
+    pub fn n_overflow(&self) -> usize {
+        self.n_overflow
+    }
+
+    /// Iterates the `(slot, value)` overflow entries in stored order.
+    pub fn overflow_entries(&self) -> impl Iterator<Item = (usize, u32)> + 'a {
+        let bytes = self.overflow_raw;
+        let count = self.n_overflow;
+        (0..count).map(move |idx| parse_overflow_entry(bytes, 0, idx))
+    }
+
+    /// Value stored at `slot`.
+    ///
+    /// Primary bytes below 255 are returned directly. Otherwise the overflow
+    /// entries are binary-searched by slot (they are always sorted by slot),
+    /// which costs O(log n_overflow).
+    ///
+    /// Panics when the slot is marked as overflow but has no overflow entry.
+    pub fn get(&self, slot: usize) -> u32 {
+        let byte = self.primary[slot];
+        if byte < 255 {
+            return u32::from(byte);
+        }
+        let (mut lo, mut hi) = (0usize, self.n_overflow);
+        while lo < hi {
+            let mid = lo + (hi - lo) / 2;
+            let (entry_slot, value) = parse_overflow_entry(self.overflow_raw, 0, mid);
+            if entry_slot == slot {
+                return value;
+            }
+            if entry_slot < slot {
+                lo = mid + 1;
+            } else {
+                hi = mid;
+            }
+        }
+        panic!("slot {slot} marked overflow but not found")
+    }
+
+    /// Sequential merge scan: yields all n values in slot order.
+    pub fn iter(&self) -> IntSliceViewIter<'a> {
+        IntSliceViewIter {
+            n:            self.n,
+            slot:         0,
+            overflow_pos: 0,
+            primary:      self.primary,
+            overflow_raw: self.overflow_raw,
+        }
+    }
+
+    /// Total of the column, computed by `byte_sum` from the primary bytes and
+    /// the overflow values.
+    pub fn sum(&self) -> u64 {
+        let overflow_values = self.overflow_entries().map(|(_, value)| value);
+        byte_sum(self.primary, overflow_values)
+    }
+
+    /// Non-zero count, computed by `byte_count_nonzero` on the primary bytes.
+    pub fn count_nonzero(&self) -> u64 {
+        byte_count_nonzero(self.primary)
+    }
+
+    /// Relative frequency `count / total`, or `0.0` when `total` is not positive.
+    #[inline]
+    fn rel_freq(count: u32, total: f64) -> f64 {
+        if total > 0.0 {
+            count as f64 / total
+        } else {
+            0.0
+        }
+    }
+
+    // ── Distance methods ──────────────────────────────────────────────────────
+
+    /// Sum over slots of `min(a, b)`. Panics on length mismatch.
+    pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 {
+        assert_eq!(self.n, other.n, "length mismatch");
+        self.iter()
+            .zip(other.iter())
+            .map(|(a, b)| u64::from(a.min(b)))
+            .sum()
+    }
+
+    /// Bray distance `1 - 2·Σmin / (Σa + Σb)`; `0.0` when both sums are zero.
+    pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 {
+        let shared = self.partial_bray_dist(other);
+        match self.sum() + other.sum() {
+            0 => 0.0,
+            total => 1.0 - 2.0 * shared as f64 / total as f64,
+        }
+    }
+
+    /// Sum over slots of `min(a / sa, b / sb)`; a side whose total is not
+    /// positive contributes `0.0`. Panics on length mismatch.
+    pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
+        assert_eq!(self.n, other.n, "length mismatch");
+        self.iter()
+            .zip(other.iter())
+            .map(|(a, b)| Self::rel_freq(a, sa).min(Self::rel_freq(b, sb)))
+            .sum()
+    }
+
+    /// One minus the relative-frequency partial above, using each view's own
+    /// `sum()` as its total; `0.0` when both totals are zero.
+    pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 {
+        let total_a = self.sum() as f64;
+        let total_b = other.sum() as f64;
+        if total_a == 0.0 && total_b == 0.0 {
+            0.0
+        } else {
+            1.0 - self.partial_relfreq_bray_dist(other, total_a, total_b)
+        }
+    }
+
+    /// Sum over slots of `(a - b)²`. Panics on length mismatch.
+    pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
+        assert_eq!(self.n, other.n, "length mismatch");
+        self.iter()
+            .zip(other.iter())
+            .map(|(a, b)| {
+                let delta = f64::from(a) - f64::from(b);
+                delta * delta
+            })
+            .sum()
+    }
+
+    /// Square root of `partial_euclidean_dist`.
+    pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
+        let squared = self.partial_euclidean_dist(other);
+        squared.sqrt()
+    }
+
+    /// Sum over slots of `(a / sa - b / sb)²`; a side whose total is not
+    /// positive contributes `0.0`. Panics on length mismatch.
+    pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
+        assert_eq!(self.n, other.n, "length mismatch");
+        self.iter()
+            .zip(other.iter())
+            .map(|(a, b)| {
+                let delta = Self::rel_freq(a, sa) - Self::rel_freq(b, sb);
+                delta * delta
+            })
+            .sum()
+    }
+
+    /// Square root of the relative-frequency partial above, using each view's
+    /// own `sum()` as its total; `0.0` when both totals are zero.
+    pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
+        let total_a = self.sum() as f64;
+        let total_b = other.sum() as f64;
+        if total_a == 0.0 && total_b == 0.0 {
+            0.0
+        } else {
+            self.partial_relfreq_euclidean_dist(other, total_a, total_b)
+                .sqrt()
+        }
+    }
+
+    /// Sum over slots of `(√(a / sa) - √(b / sb))²`; a side whose total is not
+    /// positive contributes `0.0`. Panics on length mismatch.
+    pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 {
+        assert_eq!(self.n, other.n, "length mismatch");
+        self.iter()
+            .zip(other.iter())
+            .map(|(a, b)| {
+                let delta = Self::rel_freq(a, sa).sqrt() - Self::rel_freq(b, sb).sqrt();
+                delta * delta
+            })
+            .sum()
+    }
+
+    /// Square root of the Hellinger partial above, using each view's own
+    /// `sum()` as its total; `0.0` when both totals are zero.
+    pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 {
+        let total_a = self.sum() as f64;
+        let total_b = other.sum() as f64;
+        if total_a == 0.0 && total_b == 0.0 {
+            0.0
+        } else {
+            self.partial_hellinger_euclidean_dist(other, total_a, total_b)
+                .sqrt()
+        }
+    }
+
+    /// `hellinger_euclidean_dist` divided by √2.
+    pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 {
+        let unscaled = self.hellinger_euclidean_dist(other);
+        unscaled / std::f64::consts::SQRT_2
+    }
+
+    /// Returns `(intersection, union)` slot counts, where a slot is present
+    /// on a side when its value is `>= threshold`. Panics on length mismatch.
+    pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) {
+        assert_eq!(self.n, other.n, "length mismatch");
+        let mut inter = 0u64;
+        let mut uni = 0u64;
+        for (a, b) in self.iter().zip(other.iter()) {
+            let a_present = a >= threshold;
+            let b_present = b >= threshold;
+            inter += u64::from(a_present && b_present);
+            uni += u64::from(a_present || b_present);
+        }
+        (inter, uni)
+    }
+
+    /// Jaccard distance on thresholded presence; `0.0` when the union is empty.
+    pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 {
+        match self.partial_threshold_jaccard_dist(other, threshold) {
+            (_, 0) => 0.0,
+            (inter, union) => 1.0 - inter as f64 / union as f64,
+        }
+    }
+
+    /// `threshold_jaccard_dist` with a threshold of 1.
+    pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 {
+        self.threshold_jaccard_dist(other, 1)
+    }
+}
+
+// ── IntSliceViewIter ──────────────────────────────────────────────────────────
+
+/// Iterator yielding the `n` values of a compact-int column in slot order.
+/// It walks the primary bytes and takes the next overflow entry each time a
+/// primary byte is 255.
+pub struct IntSliceViewIter<'a> {
+    /// One byte per slot.
+    primary:      &'a [u8],
+    /// Raw overflow entries, consumed front to back.
+    overflow_raw: &'a [u8],
+    /// Next slot to yield.
+    slot:         usize,
+    /// Index of the next overflow entry to consume.
+    overflow_pos: usize,
+    /// Total number of slots; iteration stops once `slot` reaches it.
+    n:            usize,
+}
+
+impl Iterator for IntSliceViewIter<'_> {
+    type Item = u32;
+
+    /// Yields the value of the current slot and advances, or `None` past the end.
+    fn next(&mut self) -> Option<u32> {
+        if self.slot >= self.n {
+            return None;
+        }
+        let byte = self.primary[self.slot];
+        self.slot += 1;
+        match byte {
+            // 255 marks an overflow slot: its value is the next overflow entry.
+            255 => {
+                let (_, value) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos);
+                self.overflow_pos += 1;
+                Some(value)
+            }
+            small => Some(u32::from(small)),
+        }
+    }
+
+    /// Exact number of values still to be yielded.
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        let remaining = self.n - self.slot;
+        (remaining, Some(remaining))
+    }
+}
+impl ExactSizeIterator for IntSliceViewIter<'_> {}
@@ -8,7 +8,8 @@ obikseq = { path = "../obikseq" }
 obifastwrite = { path = "../obifastwrite" }
 ahash = "0.8"
 hashbrown = { version = "0.14", features = ["rayon"] }
-rayon     = "1"
+rayon              = "1"
+crossbeam-channel  = "0.5"
 xxhash-rust = { version = "0.8.15", features = ["xxh3", "const_xxh3"] }
 tracing    = "0.1"

@@ -1,12 +1,14 @@
 //use ahash::RandomState;
+use crossbeam_channel;
 use hashbrown::HashMap;
 use obikseq::k;
-use obikseq::{CanonicalKmer, Sequence};
+use obikseq::{CanonicalKmer, Sequence, Unitig};
+#[cfg(not(any(test, feature = "test-utils")))]
 use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
+use std::cell::RefCell;
 use std::fmt;
 use std::sync::atomic::{AtomicU8, Ordering};
 use xxhash_rust::xxh3::Xxh3Builder;
-use tracing::{debug, info};

 // ── Types ─────────────────────────────────────────────────────────────────────

@@ -96,26 +98,6 @@ impl Node {
        (self.0 >> 5) & 0b11
    }

-    /// Number of left neighbours.
-    pub fn n_left_neighbours(self) -> u8 {
-        if self.can_extend_left() {
-            1
-        } else {
-            let v = (self.0 >> 5) & 0b11;
-            v + (v != 0) as u8
-        }
-    }
-
-    /// Number of right neighbours.
-    pub fn n_right_neighbours(self) -> u8 {
-        if self.can_extend_right() {
-            1
-        } else {
-            let v = (self.0 >> 3) & 0b11;
-            v + (v != 0) as u8
-        }
-    }
-
    /// Marks the node as visited.
    #[inline]
    pub fn set_visited(&mut self) {
@@ -162,13 +144,17 @@ impl fmt::Display for Node {
        const NUC: [char; 4] = ['A', 'C', 'G', 'T'];
        let r = if self.can_extend_right() {
            format!("→{}", NUC[self.right_nuc() as usize])
+        } else if (self.0 >> 3) & 0b11 == 0 {
+            "→0".to_string()
        } else {
-            format!("→{}", self.n_right_neighbours())
+            "→≥2".to_string()
        };
        let l = if self.can_extend_left() {
            format!("←{}", NUC[self.left_nuc() as usize])
+        } else if (self.0 >> 5) & 0b11 == 0 {
+            "←0".to_string()
        } else {
-            format!("←{}", self.n_left_neighbours())
+            "←≥2".to_string()
        };
        let v = if self.is_visited() { "V" } else { "." };
        write!(f, "Node({r} {l} {v})")
@@ -192,8 +178,12 @@ impl WalkState {
    }

    pub fn reachable(&self, graph: &GraphDeBruijn) -> bool {
-        WalkState { kmer: self.kmer, node: self.node, direct: !self.direct }
-            .leavable(graph)
+        WalkState {
+            kmer: self.kmer,
+            node: self.node,
+            direct: !self.direct,
+        }
+        .leavable(graph)
    }

    pub fn walk(&self, graph: &GraphDeBruijn) -> Option<(WalkState, u8)> {
@@ -209,8 +199,19 @@ impl WalkState {
            if next_node.is_visited() {
                return None;
            }
-            let reachable = if dnext { next_node.can_extend_left() } else { next_node.can_extend_right() };
-            reachable.then_some((WalkState { kmer: cnext, node: next_node, direct: dnext }, nuc))
+            let reachable = if dnext {
+                next_node.can_extend_left()
+            } else {
+                next_node.can_extend_right()
+            };
+            reachable.then_some((
+                WalkState {
+                    kmer: cnext,
+                    node: next_node,
+                    direct: dnext,
+                },
+                nuc,
+            ))
        } else {
            if !self.node.can_extend_left() {
                return None;
@@ -223,8 +224,19 @@ impl WalkState {
            if next_node.is_visited() {
                return None;
            }
-            let reachable = if dnext { next_node.can_extend_right() } else { next_node.can_extend_left() };
-            reachable.then_some((WalkState { kmer: cnext, node: next_node, direct: dnext }, 3 - nuc))
+            let reachable = if dnext {
+                next_node.can_extend_right()
+            } else {
+                next_node.can_extend_left()
+            };
+            reachable.then_some((
+                WalkState {
+                    kmer: cnext,
+                    node: next_node,
+                    direct: dnext,
+                },
+                3 - nuc,
+            ))
        }
    }
 }
@@ -281,14 +293,13 @@ impl GraphDeBruijn {
            }
            let (rc, rn) = count_neighbors(&kmer.right_canonical_neighbors(), &self.nodes);
            let (lc, ln) = count_neighbors(&kmer.left_canonical_neighbors(), &self.nodes);
-            let mut node = Node(0); // reset all bits (visited=0, start=0)
+            let mut node = Node(0);
            node.set_right(rc, rn);
            node.set_left(lc, ln);
            atomic.store(node.0, Ordering::Relaxed);
        });

        // Pass 2: mark start nodes
-
        self.for_each_node(|kmer, atomic| {
            let mut node = Node(atomic.load(Ordering::Relaxed));
            if node.is_visited() {
@@ -336,14 +347,28 @@ impl GraphDeBruijn {
    }

    fn unitig_nucleotides(&self, kmer: CanonicalKmer, k: usize) -> Option<UnitigNucIter<'_>> {
-        let old = self.nodes.get(&kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
-        if old & IS_VISITED_MASK != 0 { return None; }
+        let old = self
+            .nodes
+            .get(&kmer)?
+            .fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
+        if old & IS_VISITED_MASK != 0 {
+            return None;
+        }
        let start = WalkState::new(kmer, Node(old), true);
        let next_step = start.walk(self).and_then(|(next_state, nuc)| {
-            let ext_old = self.nodes.get(&next_state.kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
+            let ext_old = self
+                .nodes
+                .get(&next_state.kmer)?
+                .fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
            (ext_old & IS_VISITED_MASK == 0).then_some((next_state, nuc))
        });
-        Some(UnitigNucIter { graph: self, start: kmer, pos: 0, k, next_step })
+        Some(UnitigNucIter {
+            graph: self,
+            start: kmer,
+            pos: 0,
+            k,
+            next_step,
+        })
    }

    pub fn for_each_unitig(&self, f: impl Fn(UnitigNucIter<'_>) + Sync) {
@@ -352,7 +377,6 @@ impl GraphDeBruijn {
        let n2 = std::sync::atomic::AtomicUsize::new(0);

        // Single loop: process the start nodes, recompute the arities, repeat
-        let mut pass = 0usize;
        loop {
            let n_new = std::sync::atomic::AtomicUsize::new(0);

@@ -360,7 +384,9 @@ impl GraphDeBruijn {
            self.nodes
                .par_iter()
                .filter_map(|(&kmer, atomic)| {
-                    Node(atomic.load(Ordering::Acquire)).is_start().then_some(kmer)
+                    Node(atomic.load(Ordering::Acquire))
+                        .is_start()
+                        .then_some(kmer)
                })
                .for_each(|kmer| {
                    if let Some(iter) = self.unitig_nucleotides(kmer, k) {
@@ -380,9 +406,7 @@ impl GraphDeBruijn {
            });

            let n = n_new.load(Ordering::Relaxed);
-            debug!("[for_each_unitig] pass {}: {} starts", pass, n);
            n_chains.fetch_add(n, Ordering::Relaxed);
-            pass += 1;
            if n == 0 {
                break;
            }
@@ -411,12 +435,6 @@ impl GraphDeBruijn {
            }
        }

-        info!(
-            chains = n_chains.load(Ordering::Relaxed),
-            phase2 = n2.load(Ordering::Relaxed),
-            total  = n_chains.load(Ordering::Relaxed) + n2.load(Ordering::Relaxed),
-            "unitig traversal complete"
-        );
    }

    /// Merge `other` into `self`.
@@ -460,19 +478,31 @@ impl GraphDeBruijn {
    pub fn try_for_each_unitig<E, F>(&self, f: F) -> Result<(), E>
    where
        E: Send,
-        F: FnMut(UnitigNucIter<'_>) -> Result<(), E> + Send,
+        F: FnMut(&Unitig) -> Result<(), E> + Send,
    {
-        let error = std::sync::Mutex::new(None::<E>);
-        let f = std::sync::Mutex::new(f);
-        self.for_each_unitig(|iter| {
-            if error.lock().unwrap().is_some() {
-                return;
-            }
-            if let Err(e) = f.lock().unwrap()(iter) {
-                *error.lock().unwrap() = Some(e);
-            }
-        });
-        error.into_inner().unwrap().map_or(Ok(()), Err)
+        thread_local! {
+            static BUF: RefCell<Vec<u8>> = RefCell::new(Vec::with_capacity(4096));
+        }
+        let (tx, rx) = crossbeam_channel::bounded::<Unitig>(rayon::current_num_threads() * 256);
+        std::thread::scope(|s| {
+            let writer = s.spawn(move || -> Result<(), E> {
+                let mut f = f;
+                for unitig in rx {
+                    f(&unitig)?;
+                }
+                Ok(())
+            });
+            self.for_each_unitig(|iter| {
+                BUF.with(|buf| {
+                    let mut buf = buf.borrow_mut();
+                    buf.clear();
+                    buf.extend(iter);
+                    tx.send(Unitig::from_nucleotides(&buf)).ok();
+                });
+            });
+            drop(tx);
+            writer.join().expect("writer thread panicked")
+        })
    }

    pub fn len(&self) -> usize {
@@ -504,7 +534,11 @@ impl Iterator for UnitigNucIter<'_> {
            Some(nuc)
        } else if let Some((state, nuc)) = self.next_step.take() {
            self.next_step = state.walk(self.graph).and_then(|(next_state, next_nuc)| {
-                let old = self.graph.nodes.get(&next_state.kmer)?.fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
+                let old = self
+                    .graph
+                    .nodes
+                    .get(&next_state.kmer)?
+                    .fetch_or(IS_VISITED_MASK, Ordering::AcqRel);
                (old & IS_VISITED_MASK == 0).then_some((next_state, next_nuc))
            });
            Some(nuc)
@@ -527,22 +561,23 @@ fn count_neighbors(
    nodes: &FastHashMap<CanonicalKmer, AtomicU8>,
 ) -> (u8, Option<u8>) {
    let mut count = 0u8;
-    let mut first = None;
+    let mut nuc = 0u8;
    for (i, neighbour) in neighbors.iter().enumerate() {
        if let Some(a) = nodes.get(neighbour) {
            if Node(a.load(Ordering::Relaxed)).is_visited() {
                continue;
            }
+            nuc = i as u8;
            count += 1;
-            if first.is_none() {
-                first = Some(i as u8);
+            if count >= 2 {
+                return (2, None);
            }
        }
    }
    if count == 1 {
-        (1, first)
+        (1, Some(nuc))
    } else {
-        (count, None)
+        (0, None)
    }
 }

@@ -24,8 +24,8 @@ fn canonical_kmers(seq: &[u8]) -> Vec<CanonicalKmer> {

 fn collect_unitigs(g: &GraphDeBruijn) -> Vec<Unitig> {
    let mut unitigs = Vec::new();
-    g.try_for_each_unitig(|nuc_iter| -> Result<(), std::convert::Infallible> {
-        unitigs.push(nuc_iter.collect());
+    g.try_for_each_unitig(|unitig| -> Result<(), std::convert::Infallible> {
+        unitigs.push(unitig.clone());
        Ok(())
    })
    .unwrap();
@@ -11,8 +11,10 @@ obisys           = { path = "../obisys" }
 obicompactvec    = { path = "../obicompactvec" }
 obilayeredmap    = { path = "../obilayeredmap" }
 ndarray          = "0.16"
-rayon            = "1"
+rayon             = "1"
+crossbeam-channel = "0.5"
 serde            = { version = "1", features = ["derive"] }
 serde_json       = "1"
 indicatif        = "0.17"
 tracing          = "0.1.44"
+hwlocality       = { version = "1.0.0-alpha.11", features = ["vendored"] }
@@ -1,4 +1,7 @@
 use std::io::Write;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use rayon::prelude::*;

 use crate::error::{OKIError, OKIResult};
 use crate::index::KmerIndex;
@@ -13,14 +16,19 @@ impl KmerIndex {
    /// `force_presence` overrides `with_counts`: even if the index stores counts,
    /// the output uses 0/1 presence columns.
    ///
+    /// Partitions are scanned in parallel; each partition buffers its output locally
+    /// before the main thread writes the chunks in partition order.
+    ///
    /// The caller must have set the global kmer length (`obikseq::set_k`) before
    /// calling this method.
-    pub fn dump<W: Write>(
+    pub fn dump<W: Write, F: Fn() + Send + Sync>(
        &self,
        out: &mut W,
        force_presence: bool,
        debug: bool,
+        head: Option<usize>,
        filters: &[Box<dyn KmerFilter>],
+        on_partition: F,
    ) -> OKIResult<()> {
        let genomes    = &self.meta.genomes;
        let use_counts = self.meta.config.with_counts && !force_presence;
@@ -37,30 +45,76 @@ impl KmerIndex {
        }
        writeln!(out)?;

-        // ── Rows ──────────────────────────────────────────────────────────────
+        // ── Rows — parallel over partitions ───────────────────────────────────
        let n = self.n_partitions();
-        for i in 0..n {
-            if debug {
-                self.partition
-                    .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
-                        let seq = String::from_utf8(kmer.to_ascii())
-                            .unwrap_or_else(|_| "?".repeat(kmer_size));
-                        let _ = write!(out, "{part},{layer},{seq}");
-                        for &v in row.iter() { let _ = write!(out, ",{v}"); }
-                        let _ = writeln!(out);
-                    })
-                    .map_err(OKIError::Partition)?;
-            } else {
-                self.partition
-                    .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
-                        let seq = String::from_utf8(kmer.to_ascii())
-                            .unwrap_or_else(|_| "?".repeat(kmer_size));
-                        let _ = write!(out, "{seq}");
-                        for &v in row.iter() { let _ = write!(out, ",{v}"); }
-                        let _ = writeln!(out);
-                    })
-                    .map_err(OKIError::Partition)?;
-            }
+
+        let write_row = |buf: &mut Vec<u8>, row: &[u32], prefix: &str| {
+            let _ = buf.write_all(prefix.as_bytes());
+            for &v in row { let _ = write!(buf, ",{v}"); }
+            let _ = buf.write_all(b"\n");
+        };
+
+        let chunks: Vec<OKIResult<Vec<u8>>> = if let Some(limit) = head {
+            // ── Bounded: atomic row budget; exhausted partitions skip the scan but still report ──
+            let remaining = AtomicUsize::new(limit);
+            (0..n).into_par_iter().map(|i| {
+                if remaining.load(Ordering::Relaxed) == 0 { on_partition(); return Ok(vec![]); }
+                let mut buf = Vec::<u8>::new();
+                let try_write = |buf: &mut Vec<u8>, row: &[u32], prefix: &str| -> bool {
+                    match remaining.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |cur| {
+                        if cur > 0 { Some(cur - 1) } else { None }
+                    }) {
+                        Err(_) => false,
+                        Ok(_)  => { write_row(buf, row, prefix); true }
+                    }
+                };
+                if debug {
+                    self.partition
+                        .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
+                            let seq = String::from_utf8(kmer.to_ascii()).unwrap_or_else(|_| "?".repeat(kmer_size));
+                            try_write(&mut buf, &row, &format!("{part},{layer},{seq}"))
+                        })
+                        .map_err(OKIError::Partition)?;
+                } else {
+                    self.partition
+                        .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
+                            let seq = String::from_utf8(kmer.to_ascii()).unwrap_or_else(|_| "?".repeat(kmer_size));
+                            try_write(&mut buf, &row, &seq)
+                        })
+                        .map_err(OKIError::Partition)?;
+                }
+                on_partition();
+                Ok(buf)
+            }).collect()
+        } else {
+            // ── Unbounded: no atomic, no contention ───────────────────────────
+            (0..n).into_par_iter().map(|i| {
+                let mut buf = Vec::<u8>::new();
+                if debug {
+                    self.partition
+                        .iter_partition_kmers_located(i, use_counts, n_genomes, filters, |part, layer, kmer, row| {
+                            let seq = String::from_utf8(kmer.to_ascii()).unwrap_or_else(|_| "?".repeat(kmer_size));
+                            write_row(&mut buf, &row, &format!("{part},{layer},{seq}"));
+                            true
+                        })
+                        .map_err(OKIError::Partition)?;
+                } else {
+                    self.partition
+                        .iter_partition_kmers(i, use_counts, n_genomes, filters, |kmer, row| {
+                            let seq = String::from_utf8(kmer.to_ascii()).unwrap_or_else(|_| "?".repeat(kmer_size));
+                            write_row(&mut buf, &row, &seq);
+                            true
+                        })
+                        .map_err(OKIError::Partition)?;
+                }
+                on_partition();
+                Ok(buf)
+            }).collect()
+        };
+
+        // ── Sequential write ──────────────────────────────────────────────────
+        for chunk in chunks {
+            out.write_all(&chunk?)?;
        }

        out.flush()?;
@@ -1,8 +1,6 @@
 use std::collections::BTreeMap;
 use std::fs;
 use std::path::{Path, PathBuf};
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::{Arc, Mutex};

 use obikpartitionner::{KmerPartition, KmerSpectrum};
 use obilayeredmap;
@@ -152,31 +150,25 @@ impl KmerIndex {
        let with_counts = self.meta.config.with_counts;
        let evidence = self.meta.config.evidence.clone();
        let block_bits = self.meta.config.block_bits;
-        let total_kmers = AtomicUsize::new(0);
+        let mut total_kmers: usize = 0;
+        let pb = progress_bar("index", n as u64, "partitions");

-        let pb = Arc::new(Mutex::new(progress_bar("index", n as u64, "partitions")));
-
-        (0..n).into_par_iter().for_each(|i| {
-            match self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits) {
-                Ok(0) => {}
-                Ok(n_kmers) => {
-                    total_kmers.fetch_add(n_kmers, Ordering::Relaxed);
-                    let pb = pb.lock().unwrap();
+        let order: Vec<usize> = (0..n).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| self.partition.build_index_layer(i, min_ab, max_ab, with_counts, &evidence, block_bits),
+            |i, n_kmers, _| {
+                if n_kmers > 0 {
+                    total_kmers += n_kmers;
                    pb.inc(1);
                    pb.set_message(format!("{i}: {n_kmers} kmers"));
                }
-                Err(e) => {
-                    eprintln!("error building layer for partition {i}: {e}");
-                    std::process::exit(1);
-                }
-            }
-        });
+            },
+        ).map_err(OKIError::Partition)?;

-        pb.lock().unwrap().finish_and_clear();
-        info!(
-            "done — {} total kmers indexed",
-            total_kmers.load(Ordering::Relaxed)
-        );
+        pb.finish_and_clear();
+        info!("done — {} total kmers indexed", total_kmers);

        if !keep_intermediate {
            for i in 0..n {
@@ -211,35 +203,27 @@ impl KmerIndex {
        use obilayeredmap::meta::PartitionMeta;

        let n = self.n_partitions();
-        let errors: Vec<_> = (0..n)
-            .into_par_iter()
-            .filter_map(|i| {
+        let order: Vec<usize> = (0..n).collect();
+        let pb = progress_bar("pack", n as u64, "partitions");
+        crate::numa::PartitionRunner::new().run(
+            &order,
+            |i| -> OKIResult<()> {
                let index_dir = self.partition.part_dir(i).join("index");
-                if !index_dir.exists() { return None; }
-                let meta = match PartitionMeta::load(&index_dir) {
-                    Ok(m) => m,
-                    Err(e) => return Some(OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))),
-                };
+                if !index_dir.exists() { return Ok(()); }
+                let meta = PartitionMeta::load(&index_dir)
+                    .map_err(|e| OKIError::Io(std::io::Error::new(std::io::ErrorKind::Other, e.to_string())))?;
                for l in 0..meta.n_layers {
                    let layer_dir = index_dir.join(format!("layer_{l}"));
                    let presence_dir = layer_dir.join("presence");
                    let counts_dir   = layer_dir.join("counts");
-                    if presence_dir.exists() {
-                        if let Err(e) = pack_bit_matrix(&presence_dir) {
-                            return Some(OKIError::Io(e));
-                        }
-                    }
-                    if counts_dir.exists() {
-                        if let Err(e) = pack_compact_int_matrix(&counts_dir) {
-                            return Some(OKIError::Io(e));
-                        }
-                    }
+                    if presence_dir.exists() { pack_bit_matrix(&presence_dir).map_err(OKIError::Io)?; }
+                    if counts_dir.exists()   { pack_compact_int_matrix(&counts_dir).map_err(OKIError::Io)?; }
                }
-                None
-            })
-            .collect();
-
-        if let Some(e) = errors.into_iter().next() { return Err(e); }
+                Ok(())
+            },
+            |_, _, _| { pb.inc(1); },
+        )?;
+        pb.finish_and_clear();
        Ok(())
    }

@@ -5,8 +5,10 @@ mod distance;
 mod dump;
 mod index;
 mod merge;
+mod numa;
 mod rebuild;
 mod reindex;
+mod select;
 mod stats;

 pub use error::{OKIError, OKIResult};
@@ -2,36 +2,38 @@ use std::collections::HashMap;
 use std::fs;
 use std::io;
 use std::path::Path;
+
 use obisys::{Reporter, Stage, progress_bar, spinner};
-use rayon::prelude::*;
-use tracing::info;
+use tracing::{debug, info};

 use obilayeredmap::IndexMode;

 use crate::error::{OKIError, OKIResult};
 use crate::index::KmerIndex;
 use crate::meta::{GenomeInfo, IndexMeta};
-use crate::state::IndexState;
+use crate::state::{IndexState, SENTINEL_INDEXED};

 pub use obikpartitionner::MergeMode;

+// ── per-partition diagnostic record ──────────────────────────────────────────
+
+#[derive(Debug)]
+struct PartStat {
+    id: usize, // partition index this record describes
+    unitig_bytes: u64, // unitig file bytes for this partition, summed over the merged-in sources
+    g_len: usize, // count handed to the completion callback; logged as the number of new kmers
+}
+
+// ── main merge entry point ────────────────────────────────────────────────────
+
 impl KmerIndex {
-    /// Merge `sources` into a new index at `output`.
-    ///
-    /// All sources must be in `Indexed` state and share the same `kmer_size`,
-    /// `minimizer_size`, and `n_partitions`. Count mode additionally requires
-    /// every source to have `with_counts = true`.
-    ///
-    /// Genome labels must be unique across all sources. If `rename_duplicates`
-    /// is true, repeated labels are disambiguated by appending `.1`, `.2`, …
-    /// to the second and subsequent occurrences. Otherwise a
-    /// `DuplicateGenomeLabel` error is returned on the first conflict.
    pub fn merge<P: AsRef<Path>>(
        output: P,
        sources: &[&KmerIndex],
        mode: MergeMode,
        force: bool,
        rename_duplicates: bool,
+        budget_fraction: f64,
        rep: &mut Reporter,
    ) -> OKIResult<Self> {
        let output = output.as_ref();
@@ -49,9 +51,9 @@ impl KmerIndex {
            if src.state() != IndexState::Indexed {
                return Err(OKIError::NotIndexed(src.root_path.clone()));
            }
-            if src.kmer_size()      != ref0.kmer_size()
-            || src.minimizer_size() != ref0.minimizer_size()
-            || src.n_partitions()   != ref0.n_partitions()
+            if src.kmer_size() != ref0.kmer_size()
+                || src.minimizer_size() != ref0.minimizer_size()
+                || src.n_partitions() != ref0.n_partitions()
            {
                return Err(OKIError::IncompatibleConfig);
            }
@@ -61,44 +63,70 @@ impl KmerIndex {
        }

        // ── Log source characteristics and choose base ────────────────────────
-        let mode_str = if mode == MergeMode::Presence { "presence" } else { "count" };
+        let mode_str = if mode == MergeMode::Presence {
+            "presence"
+        } else {
+            "count"
+        };
        info!(
            "merge: {} source(s), smer-size={}, mode={}",
-            sources.len(), sources[0].kmer_size(), mode_str,
+            sources.len(),
+            sources[0].kmer_size(),
+            mode_str,
        );
        for (i, src) in sources.iter().enumerate() {
-            let genome_str = if src.meta.genomes.len() == 1 { "mono-genome".to_string() }
-                             else { format!("{} genomes", src.meta.genomes.len()) };
-            let trivial_str = if is_trivial(src, mode) { " [trivial: no data approximation]" } else { "" };
+            let genome_str = if src.meta.genomes.len() == 1 {
+                "mono-genome".to_string()
+            } else {
+                format!("{} genomes", src.meta.genomes.len())
+            };
+            let trivial_str = if is_trivial(src, mode) {
+                " [trivial: no data approximation]"
+            } else {
+                ""
+            };
            info!(
                "  [{}] {} — {}, {}, {}{}",
-                i, src.root_path.display(),
+                i,
+                src.root_path.display(),
                format_evidence(&src.meta.config.evidence),
-                genome_str, mode_str, trivial_str,
+                genome_str,
+                mode_str,
+                trivial_str,
            );
        }

        let base_idx = choose_base(sources, mode);
        let needs_approx = sources.iter().any(|src| {
            !is_trivial(src, mode)
-                && matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
+                && matches!(
+                    src.meta.config.evidence,
+                    IndexMode::Approx { .. } | IndexMode::Hybrid { .. }
+                )
        });
        info!(
            "output evidence: {} ({}base: [{}] {})",
            format_evidence(&sources[base_idx].meta.config.evidence),
-            if needs_approx { "forced approx — " } else { "" },
-            base_idx, sources[base_idx].root_path.display(),
+            if needs_approx {
+                "forced approx — "
+            } else {
+                ""
+            },
+            base_idx,
+            sources[base_idx].root_path.display(),
        );

        let mut ordered: Vec<&KmerIndex> = Vec::with_capacity(sources.len());
        ordered.push(sources[base_idx]);
        for (i, &src) in sources.iter().enumerate() {
-            if i != base_idx { ordered.push(src); }
+            if i != base_idx {
+                ordered.push(src);
+            }
        }
        let sources: &[&KmerIndex] = &ordered;
        let evidence = sources[0].meta.config.evidence.clone();

-        // ── Compute final genome labels (rename duplicates if requested) ───────
+        // ── Compute final genome labels ────────────────────────────────────────
        let (source_labels, all_genomes) = compute_labels(sources, rename_duplicates)?;

        // ── Prepare output directory ──────────────────────────────────────────
@@ -125,23 +153,19 @@ impl KmerIndex {
        pb.set_message("copying index …");
        copy_dir_all(&sources[0].root_path, output)?;

-        // Rewrite index.meta with final genome labels and the effective mode.
        let mut meta = IndexMeta::read(output).map_err(OKIError::Io)?;
        meta.genomes = all_genomes;
        meta.config.with_counts = mode == MergeMode::Count;
        meta.config.evidence = evidence.clone();
        meta.write(output)?;

-        // In presence/absence mode, purge counts/ directories inherited from
-        // source_0 — they are stale data from the source's count index.
        if mode == MergeMode::Presence {
            remove_dirs_named(output, "counts")?;
        }
        pb.finish_and_clear();
        rep.push(t.stop());

-        // Rebuild spectrums/ from all sources using the (possibly renamed) labels.
-        // Drop the spectrums/ that were copied from source_0 and rebuild from scratch.
+        // ── Rebuild spectrums ─────────────────────────────────────────────────
        info!("rebuilding spectrums for {} source(s)", sources.len());
        let t = Stage::start("spectrums");
        let pb = spinner("spectrums");
@@ -151,18 +175,19 @@ impl KmerIndex {
            fs::remove_dir_all(&spectrums_dir)?;
        }
        for (src, new_labels) in sources.iter().zip(&source_labels) {
-            let old_labels: Vec<String> = src.meta.genomes.iter().map(|g| g.label.clone()).collect();
+            let old_labels: Vec<String> =
+                src.meta.genomes.iter().map(|g| g.label.clone()).collect();
            copy_spectrums(&src.root_path, output, &old_labels, new_labels)?;
        }
        pb.finish_and_clear();
        rep.push(t.stop());

-        // Open the destination index.
+        // ── Open destination ──────────────────────────────────────────────────
        let dst = KmerIndex::open(output)?;
        let n_partitions = dst.n_partitions();
        let n_dst_genomes = sources[0].meta.genomes.len();

-        // ── Merge each subsequent source partition-by-partition ───────────────
+        // ── Merge partitions ──────────────────────────────────────────────────
        let remaining_sources: Vec<&KmerIndex> = sources[1..].to_vec();
        if !remaining_sources.is_empty() {
            let n_src_genomes: usize = remaining_sources.iter().map(|s| s.meta.genomes.len()).sum();
@@ -176,21 +201,53 @@ impl KmerIndex {
            let dst_partition = &dst.partition;
            let block_bits = dst.meta.config.block_bits;

-            let errors: Vec<obiskio::SKError> = (0..n_partitions)
-                .into_par_iter()
-                .filter_map(|i| {
-                    let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> =
-                        remaining_sources.iter().map(|s| (&s.partition, s.meta.genomes.len())).collect();
-                    let result = dst_partition.merge_partition(i, &srcs, mode, n_dst_genomes, block_bits, &evidence).err();
-                    pb.inc(1);
-                    result
+            // Pre-build source list once (avoid rebuilding per partition)
+            let srcs: Vec<(&obikpartitionner::KmerPartition, usize)> = remaining_sources
+                .iter()
+                .map(|s| (&s.partition, s.meta.genomes.len()))
+                .collect();
+
+            // Per-partition unitig byte sizes across remaining sources (stat() only)
+            let partition_sizes: Vec<u64> = (0..n_partitions)
+                .map(|i| {
+                    remaining_sources
+                        .iter()
+                        .map(|s| partition_unitig_bytes(s, i))
+                        .sum()
                })
                .collect();

+            // LFD sort: largest partition first
+            let mut order: Vec<usize> = (0..n_partitions).collect();
+            order.sort_unstable_by_key(|&i| std::cmp::Reverse(partition_sizes[i]));
+
+            let _ = budget_fraction; // kept in signature for CLI compatibility
+
+            // Shadow as references so closures can capture them by copy.
+            let srcs = &srcs;
+            let evidence = &evidence;
+
+            let runner = crate::numa::PartitionRunner::new();
+            let mut part_stats: Vec<PartStat> = Vec::with_capacity(n_partitions);
+
+            runner.run(
+                &order,
+                |i| dst_partition.merge_partition(i, srcs, mode, n_dst_genomes, block_bits, evidence),
+                |i, g_len, dur| {
+                    pb.inc(1);
+                    debug!(
+                        "partition {i}: done in {:.1}s — {} new kmers",
+                        dur.as_secs_f64(),
+                        g_len,
+                    );
+                    part_stats.push(PartStat { id: i, unitig_bytes: partition_sizes[i], g_len });
+                },
+            ).map_err(OKIError::Partition)?;
+
            pb.finish_and_clear();
-            if let Some(e) = errors.into_iter().next() {
-                return Err(OKIError::Partition(e));
-            }
+
+            // ── Diagnostic report ─────────────────────────────────────────────
+            print_merge_partition_report(&part_stats, runner.max_workers());

            rep.push(t.stop());
        }
@@ -206,19 +263,76 @@ impl KmerIndex {
            rep.push(t.stop());
        }

-        // Re-open to get the updated state.
+        fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
+
        KmerIndex::open(output)
    }
 }

-// ── Helpers ───────────────────────────────────────────────────────────────────
+// ── Diagnostic report ─────────────────────────────────────────────────────────
+
+/// Log a summary of the partition merge: totals, worker count, and the
+/// (at most 8) partitions that contributed the most new kmers.
+fn print_merge_partition_report(stats: &[PartStat], max_workers: usize) {
+    let total_new: usize = stats.iter().map(|s| s.g_len).sum();
+    let non_empty = stats.iter().filter(|s| s.unitig_bytes > 0).count();
+    if non_empty == 0 {
+        info!("merge_partitions report: no data (all partitions empty)");
+        return;
+    }
+
+    info!("─── merge_partitions report ───");
+    info!("  {non_empty} partition(s) processed, {total_new} total new kmers");
+    info!("  max workers: {max_workers}");
+
+    // Top 8 partitions by new-kmer count
+    let mut by_new: Vec<&PartStat> = stats.iter().filter(|s| s.g_len > 0).collect();
+    by_new.sort_by_key(|s| std::cmp::Reverse(s.g_len));
+    if !by_new.is_empty() {
+        info!("  top partitions by new kmers:");
+        for s in by_new.iter().take(8) {
+            // Print the exact count: scaling to whole millions would truncate
+            // any partition under one million new kmers to 0.
+            info!(
+                "    partition {:4} : {} new kmers  ({} unitig bytes)",
+                s.id,
+                s.g_len,
+                fmt_bytes(s.unitig_bytes),
+            );
+        }
+    }
+    info!("───────────────────────────────");
+}
+
+// ── helpers ───────────────────────────────────────────────────────────────────
+
+/// Format a byte count with one decimal in binary-scaled KB/MB/GB units.
+fn fmt_bytes(b: u64) -> String {
+    const UNITS: [(u64, &str); 3] = [(1 << 30, "GB"), (1 << 20, "MB"), (1 << 10, "KB")];
+    for (scale, suffix) in UNITS {
+        if b >= scale {
+            return format!("{:.1} {suffix}", b as f64 / scale as f64);
+        }
+    }
+    // Below 1 KB: print the exact byte count.
+    format!("{b} B")
+}
+
+/// Total on-disk size of the unitig files of partition `i` in `src`.
+///
+/// Layers are probed from 0 upward via `layer_unitigs_path`; the scan stops
+/// at the first path that does not exist. A file that exists but whose
+/// metadata cannot be read contributes 0 bytes.
+fn partition_unitig_bytes(src: &KmerIndex, i: usize) -> u64 {
+    (0..)
+        .map(|layer| src.layer_unitigs_path(i, layer))
+        // Layers are assumed contiguous: the first gap ends the scan.
+        .take_while(|path| path.exists())
+        // stat() only — file contents are never read.
+        .map(|path| std::fs::metadata(&path).map_or(0, |meta| meta.len()))
+        .sum()
+}

-/// Compute the final genome label lists for all sources.
-///
-/// Returns `(per_source_labels, all_genomes_flat)`.
-/// The first occurrence of a label keeps the original name. Subsequent
-/// occurrences receive `.1`, `.2`, … suffixes when `rename_duplicates` is true,
-/// or trigger a `DuplicateGenomeLabel` error otherwise.
 fn compute_labels(
    sources: &[&KmerIndex],
    rename_duplicates: bool,
@@ -241,7 +355,10 @@ fn compute_labels(
            };
            *count += 1;
            labels.push(new_label.clone());
-            all_genomes.push(GenomeInfo { label: new_label, meta: genome.meta.clone() });
+            all_genomes.push(GenomeInfo {
+                label: new_label,
+                meta: genome.meta.clone(),
+            });
        }
        source_labels.push(labels);
    }
@@ -249,8 +366,6 @@ fn compute_labels(
    Ok((source_labels, all_genomes))
 }

-/// Copy spectrum JSON files from `src_root/spectrums/` to `dst_root/spectrums/`,
-/// mapping each `old_labels[i]` filename to `new_labels[i]`.
 fn copy_spectrums(
    src_root: &Path,
    dst_root: &Path,
@@ -269,7 +384,6 @@ fn copy_spectrums(
    Ok(())
 }

-/// Recursively remove every directory named `name` under `root`.
 fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> {
    for entry in fs::read_dir(root)? {
        let entry = entry?;
@@ -285,56 +399,41 @@ fn remove_dirs_named(root: &Path, name: &str) -> io::Result<()> {
    Ok(())
 }

-
 fn format_evidence(ev: &IndexMode) -> String {
    match ev {
-        IndexMode::Exact             => "exact".to_string(),
-        IndexMode::Approx { b, z }   => format!("approx (b={b}, z={z})"),
-        IndexMode::Hybrid { b, z }   => format!("hybrid (b={b}, z={z})"),
+        IndexMode::Exact => "exact".to_string(),
+        IndexMode::Approx { b, z } => format!("approx (b={b}, z={z})"),
+        IndexMode::Hybrid { b, z } => format!("hybrid (b={b}, z={z})"),
    }
 }

-/// A source is "trivial" if its presence/count values carry no approximation:
-/// single-genome presence index (SetMembership — all values are 1 by construction).
 fn is_trivial(src: &KmerIndex, mode: MergeMode) -> bool {
    src.meta.genomes.len() == 1 && mode == MergeMode::Presence
 }

-/// Sum of all `unitigs.bin` sizes across every partition and layer.
-/// Used as a proxy for the number of indexed smers.
 fn index_unitig_size(src: &KmerIndex) -> u64 {
    let n = src.partition.n_partitions();
-    let mut total = 0u64;
-    for i in 0..n {
-        let index_dir = src.partition.part_dir(i).join("index");
-        let mut l = 0usize;
-        loop {
-            let p = index_dir.join(format!("layer_{l}")).join("unitigs.bin");
-            if !p.exists() { break; }
-            if let Ok(m) = std::fs::metadata(&p) { total += m.len(); }
-            l += 1;
-        }
-    }
-    total
+    (0..n).map(|i| partition_unitig_bytes(src, i)).sum()
 }

-/// Choose the index to use as bootstrap base.
-///
-/// Rule — mieux-disant: if any non-trivial source uses approximate evidence
-/// (Approx or Hybrid), the output must also be approximate; the base must
-/// therefore come from an approximate source so its layers carry the right
-/// evidence files.  Among qualifying candidates, the largest (by unitig size)
-/// is chosen to minimise the number of new smers in the merge layer.
 fn choose_base(sources: &[&KmerIndex], mode: MergeMode) -> usize {
    let needs_approx = sources.iter().any(|src| {
        !is_trivial(src, mode)
-            && matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
+            && matches!(
+                src.meta.config.evidence,
+                IndexMode::Approx { .. } | IndexMode::Hybrid { .. }
+            )
    });

-    sources.iter().enumerate()
+    sources
+        .iter()
+        .enumerate()
        .filter(|(_, src)| {
            !needs_approx
-                || matches!(src.meta.config.evidence, IndexMode::Approx { .. } | IndexMode::Hybrid { .. })
+                || matches!(
+                    src.meta.config.evidence,
+                    IndexMode::Approx { .. } | IndexMode::Hybrid { .. }
+                )
        })
        .max_by_key(|(_, src)| index_unitig_size(src))
        .map(|(i, _)| i)
@@ -0,0 +1,251 @@
+// NUMA-aware partition runner via hwlocality.
+//
+// Detects NUMA topology using hwloc (cross-platform: Linux, macOS, etc.) and
+// builds one Rayon ThreadPool per NUMA node with threads pinned to that node's
+// CPUs. Linux first-touch policy then places graph allocations in local DRAM
+// automatically — no explicit memory binding needed.
+//
+// Returns None when:
+//   - hwloc topology initialisation fails
+//   - the system has only one NUMA node (UMA, Apple Silicon, single-socket)
+//   - any per-node pool fails to build
+
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use crossbeam_channel::unbounded;
+use hwlocality::Topology;
+use hwlocality::cpu::binding::CpuBindingFlags;
+use hwlocality::cpu::cpuset::CpuSet;
+use hwlocality::object::types::ObjectType;
+use tracing::debug;
+
+// ── Public interface ──────────────────────────────────────────────────────────
+
+/// Result of NUMA topology detection.
+///
+/// `pools` and `cpus_per_node` are parallel vectors: entry `k` of each one
+/// describes the same NUMA node.
+pub struct NumaSetup {
+    /// One Rayon thread pool per NUMA node, in node order.
+    pub pools: Vec<Arc<rayon::ThreadPool>>,
+    /// CPU indices for each NUMA node, in node order.
+    pub cpus_per_node: Vec<Vec<usize>>,
+}
+
+impl NumaSetup {
+    /// Number of partition workers to run on each NUMA node.
+    ///
+    /// One worker per 8 CPUs of the first node, kept within `3..=8`
+    /// (empirically ~3 workers saturate one node's memory bandwidth).
+    /// Falls back to 3 when no node is recorded.
+    pub fn workers_per_node(&self) -> usize {
+        match self.cpus_per_node.first() {
+            Some(cpus) => (cpus.len() / 8).clamp(3, 8),
+            None => 3,
+        }
+    }
+}
+
+/// Probe the machine's NUMA layout and create one pinned Rayon pool per node.
+///
+/// Yields `None` when the hwloc topology cannot be loaded, when at most one
+/// NUMA node with CPUs is found, or when any per-node pool fails to build.
+pub fn build() -> Option<NumaSetup> {
+    let topology = Topology::new().ok()?;
+
+    // Collect the CPU indices of every NUMA node that actually owns CPUs.
+    let mut nodes: Vec<Vec<usize>> = Vec::new();
+    for obj in topology.objects_with_type(ObjectType::NUMANode) {
+        let Some(cpuset) = obj.cpuset() else { continue };
+        let cpus: Vec<usize> = cpuset.iter_set().map(usize::from).collect();
+        if !cpus.is_empty() {
+            nodes.push(cpus);
+        }
+    }
+
+    // Zero or one node: nothing to gain from NUMA-aware scheduling.
+    if nodes.len() <= 1 {
+        return None;
+    }
+
+    debug!(
+        "NUMA topology: {} node(s), {} core(s)/node",
+        nodes.len(),
+        nodes.first().map_or(0, Vec::len),
+    );
+
+    // Any pool failure aborts the whole setup.
+    let mut pools = Vec::with_capacity(nodes.len());
+    for cpus in &nodes {
+        pools.push(Arc::new(build_pool(cpus)?));
+    }
+
+    Some(NumaSetup { pools, cpus_per_node: nodes })
+}
+
+/// Restrict the calling thread to the CPUs listed in `cpu_indices`.
+///
+/// Best effort: if the hwloc topology cannot be loaded or the binding call
+/// fails, the error is discarded and the thread keeps running unbound.
+/// A fresh `Topology` is created on every call.
+pub fn pin_current_thread(cpu_indices: &[usize]) {
+    let topology = match Topology::new() {
+        Ok(t) => t,
+        Err(_) => return,
+    };
+
+    let mut target = CpuSet::new();
+    cpu_indices.iter().for_each(|&cpu| {
+        target.set(cpu);
+    });
+
+    let _ = topology.bind_cpu(&target, CpuBindingFlags::THREAD);
+}
+
+// ── Internal helpers ──────────────────────────────────────────────────────────
+
+/// Build a Rayon pool with one thread per entry of `cpus`; every thread binds
+/// itself to the whole `cpus` set before it starts running Rayon work.
+/// Returns `None` if the pool cannot be built.
+fn build_pool(cpus: &[usize]) -> Option<rayon::ThreadPool> {
+    // Shared, immutable copy of the CPU list; each spawned thread gets a handle.
+    let node_cpus: Arc<[usize]> = Arc::from(cpus);
+    let builder = rayon::ThreadPoolBuilder::new()
+        .num_threads(node_cpus.len())
+        .spawn_handler(move |thread| {
+            let node_cpus = Arc::clone(&node_cpus);
+            std::thread::Builder::new()
+                .spawn(move || {
+                    pin_current_thread(&node_cpus);
+                    thread.run();
+                })
+                .map(|_handle| ())
+        });
+    builder.build().ok()
+}
+
+// ── PartitionRunner ───────────────────────────────────────────────────────────
+
+/// Per-node execution settings used by `PartitionRunner`.
+struct NodeConfig {
+    /// Rayon pool that partition work is installed into; `None` in UMA mode,
+    /// where the work runs directly on the worker thread.
+    pool:        Option<Arc<rayon::ThreadPool>>,
+    /// CPUs the node's workers are pinned to; empty means no pinning.
+    cpu_ids:     Vec<usize>,
+    /// Number of worker threads this node contributes.
+    max_workers: usize,
+}
+
+/// Generic NUMA-aware runner for partition-level parallel work.
+///
+/// Workers are distributed round-robin across NUMA nodes and pinned to their
+/// node's CPUs.  UMA systems are the degenerate case: one node, no pinning.
+///
+/// # Termination
+///
+/// Termination is driven entirely by channel closure:
+///
+/// ```text
+/// drop(part_tx)    → part_rx drains → workers exit → drop their result_tx
+/// drop(result_tx)  → result_rx closes → controller loop exits
+/// ```
+///
+/// No explicit counter or sentinel needed.
+pub struct PartitionRunner {
+    /// One entry per NUMA node; a single pool-less entry in UMA mode.
+    nodes: Vec<NodeConfig>,
+}
+
+impl PartitionRunner {
+    /// Total worker slots across all nodes.
+    pub fn max_workers(&self) -> usize {
+        self.nodes.iter().map(|n| n.max_workers).sum()
+    }
+
+    /// Detect topology and build.  Falls back to a single-node UMA runner on
+    /// macOS, single-socket machines, or hwloc failure.
+    ///
+    /// In NUMA mode every node gets the same number of workers
+    /// (`NumaSetup::workers_per_node`).  In UMA mode the single node gets
+    /// half of the available parallelism, with a minimum of one worker.
+    pub fn new() -> Self {
+        match build() {
+            Some(ns) => {
+                let wpn = ns.workers_per_node();
+                debug!(
+                    "PartitionRunner: NUMA mode — {} node(s) × {} worker(s)/node",
+                    ns.pools.len(), wpn,
+                );
+                let nodes = ns.pools
+                    .into_iter()
+                    .zip(ns.cpus_per_node)
+                    .map(|(pool, cpu_ids)| NodeConfig {
+                        pool:        Some(pool),
+                        cpu_ids,
+                        max_workers: wpn,
+                    })
+                    .collect();
+                Self { nodes }
+            }
+            None => {
+                let n_cores = std::thread::available_parallelism()
+                    .map(|n| n.get())
+                    .unwrap_or(1);
+                let max_workers = (n_cores / 2).max(1);
+                debug!("PartitionRunner: UMA mode — {} worker(s)", max_workers);
+                Self {
+                    nodes: vec![NodeConfig {
+                        pool:        None,
+                        cpu_ids:     vec![],
+                        max_workers,
+                    }],
+                }
+            }
+        }
+    }
+
+    /// Run `f(i)` for every index in `order`.
+    ///
+    /// Workers are spawned upfront and distributed round-robin across NUMA
+    /// nodes; at most `order.len()` workers are started, so a short work list
+    /// does not create idle threads.  `on_done(i, result, elapsed)` is called
+    /// from the controller thread as each partition completes — suitable for
+    /// progress bars and result aggregation.
+    ///
+    /// # Errors
+    ///
+    /// Returns the first error produced by `f`, if any.  Once an error has
+    /// been received, partitions that have not started yet are discarded;
+    /// partitions already in flight run to completion and `on_done` is still
+    /// called for those that succeed.
+    pub fn run<F, R, E, C>(
+        &self,
+        order:      &[usize],
+        f:          F,
+        mut on_done: C,
+    ) -> Result<(), E>
+    where
+        F: Fn(usize) -> Result<R, E> + Send + Sync,
+        R: Send,
+        E: Send,
+        C: FnMut(usize, R, Duration) + Send,
+    {
+        // Pre-load the work queue, then drop the sender so workers' part_rx
+        // iterators terminate when the queue is drained.
+        let (part_tx, part_rx) = unbounded::<usize>();
+        for &i in order { part_tx.send(i).ok(); }
+        drop(part_tx);
+
+        let (result_tx, result_rx) = unbounded::<(usize, Result<R, E>, Duration)>();
+        let n_nodes = self.nodes.len();
+        let f = &f; // shared borrow; F: Sync so concurrent calls are safe
+
+        // A worker beyond the number of partitions would only be spawned,
+        // pinned, and exit immediately — do not create it.
+        let n_workers = self.max_workers().min(order.len());
+
+        let mut first_err: Option<E> = None;
+
+        std::thread::scope(|s| {
+            // Spawn all workers upfront, round-robin across NUMA nodes.
+            for w in 0..n_workers {
+                let node    = &self.nodes[w % n_nodes];
+                let prx     = part_rx.clone();
+                let rtx     = result_tx.clone();
+                let pool    = node.pool.clone();
+                let cpu_ids = &node.cpu_ids;
+
+                s.spawn(move || {
+                    if !cpu_ids.is_empty() { pin_current_thread(cpu_ids); }
+                    for i in &prx {
+                        let t = Instant::now();
+                        let r = match &pool {
+                            Some(p) => p.install(|| f(i)),
+                            None    => f(i),
+                        };
+                        rtx.send((i, r, t.elapsed())).ok();
+                    }
+                });
+            }
+
+            // Drop the controller's sender: result_rx closes once all worker
+            // rtx clones are dropped (i.e. all workers have exited).
+            drop(result_tx);
+
+            // Drain results concurrently with workers.  The for loop exits
+            // when result_rx is disconnected — at that point all workers are
+            // done and the scope join below is instantaneous.
+            for (i, r, dur) in &result_rx {
+                match r {
+                    Ok(v)  => on_done(i, v, dur),
+                    Err(e) => {
+                        if first_err.is_none() {
+                            first_err = Some(e);
+                            // Fail fast: empty the pending queue so each
+                            // worker stops after its in-flight partition
+                            // instead of processing work whose outcome will
+                            // be discarded anyway.
+                            while part_rx.try_recv().is_ok() {}
+                        }
+                    }
+                }
+            }
+        });
+
+        match first_err {
+            Some(e) => Err(e),
+            None    => Ok(()),
+        }
+    }
+}
@@ -4,7 +4,6 @@ use std::path::Path;

 use obikpartitionner::{KmerFilter, KmerPartition, MergeMode};
 use obisys::{Reporter, Stage, progress_bar};
-use rayon::prelude::*;
 use tracing::info;

 use crate::error::{OKIError, OKIResult};
@@ -83,30 +82,25 @@ impl KmerIndex {
        let src_partition = &src.partition;
        let block_bits = meta.config.block_bits;

-        let errors: Vec<obiskio::SKError> = (0..n_partitions)
-            .into_par_iter()
-            .filter_map(|i| {
-                let result = dst_partition
-                    .rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits)
-                    .err();
-                pb.inc(1);
-                result
-            })
-            .collect();
+        let order: Vec<usize> = (0..n_partitions).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| dst_partition.rebuild_partition(src_partition, i, filters, mode, n_genomes, block_bits),
+            |_, _, _| { pb.inc(1); },
+        ).map_err(OKIError::Partition)?;

        pb.finish_and_clear();

-        if let Some(e) = errors.into_iter().next() {
-            return Err(OKIError::Partition(e));
-        }
-
        rep.push(t.stop());

        // Write SENTINEL_INDEXED — output is ready to use.
        fs::File::create(output.join(SENTINEL_INDEXED))?;

        let idx = KmerIndex::open(output)?;
+        let t_pack = Stage::start("pack");
        idx.pack_matrices()?;
+        rep.push(t_pack.stop());
        Ok(idx)
    }
 }
@@ -3,7 +3,6 @@ use std::path::Path;
 use obilayeredmap::{IndexMode, layer::Layer};
 use obilayeredmap::meta::PartitionMeta;
 use obisys::{Reporter, Stage, progress_bar};
-use rayon::prelude::*;
 use tracing::info;

 use crate::error::{OKIError, OKIResult};
@@ -45,25 +44,17 @@ impl KmerIndex {
        let t = Stage::start("reindex");
        let pb = progress_bar("reindex", n as u64, "partitions");

-        let errors: Vec<String> = (0..n)
-            .into_par_iter()
-            .filter_map(|i| {
-                let res = reindex_partition(
-                    &self.partition.part_dir(i).join("index"),
-                    &target,
-                    block_bits,
-                );
-                pb.inc(1);
-                res.err().map(|e| format!("partition {i}: {e}"))
-            })
-            .collect();
+        let order: Vec<usize> = (0..n).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        runner.run(
+            &order,
+            |i| reindex_partition(&self.partition.part_dir(i).join("index"), &target, block_bits)
+                .map_err(|e| OKIError::InvalidInput(format!("partition {i}: {e}"))),
+            |_, _, _| { pb.inc(1); },
+        )?;

        pb.finish_and_clear();

-        if let Some(e) = errors.into_iter().next() {
-            return Err(OKIError::InvalidInput(e));
-        }
-
        self.meta.config.evidence = target;
        if matches!(self.meta.config.evidence, IndexMode::Exact) {
            self.meta.config.block_bits = block_bits;
@@ -0,0 +1,150 @@
+use std::fs;
+use std::io;
+use std::path::Path;
+
+use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR};
+use obisys::{Reporter, Stage, progress_bar};
+use tracing::info;
+
+use crate::error::{OKIError, OKIResult};
+use crate::index::KmerIndex;
+use crate::meta::{GenomeInfo, IndexMeta};
+use crate::state::{IndexState, SENTINEL_INDEXED};
+
+impl KmerIndex {
+    /// Create a new index at `output` by projecting/aggregating the genome columns
+    /// of `src` according to `specs`.
+    ///
+    /// `output_presence` — if true, output uses bit matrices (0/1), regardless of
+    /// whether the source stores counts. The caller is responsible for ensuring all
+    /// specs use logical operators when `output_presence=true` on a count source.
+    ///
+    /// `threshold` is forwarded unchanged to `select_partition`.  When `output`
+    /// already exists it is removed if `force` is true; otherwise the call fails.
+    ///
+    /// # Errors
+    ///
+    /// - `OKIError::NotIndexed` if `src` is not in the `Indexed` state.
+    /// - `OKIError::Io` (`AlreadyExists`) if `output` exists and `force` is false.
+    /// - `OKIError::Partition` with the first error reported by a partition.
+    /// - Any error from writing metadata, opening the result, or packing matrices.
+    pub fn select<P: AsRef<Path>>(
+        output: P,
+        src: &KmerIndex,
+        specs: &[OutputCol],
+        threshold: u32,
+        output_presence: bool,
+        force: bool,
+        rep: &mut Reporter,
+    ) -> OKIResult<Self> {
+        let output = output.as_ref();
+
+        if src.state() != IndexState::Indexed {
+            return Err(OKIError::NotIndexed(src.root_path.clone()));
+        }
+
+        if output.exists() {
+            if force {
+                fs::remove_dir_all(output)?;
+            } else {
+                return Err(OKIError::Io(io::Error::new(
+                    io::ErrorKind::AlreadyExists,
+                    format!("{}: output directory already exists", output.display()),
+                )));
+            }
+        }
+
+        // The output inherits the source configuration; only the matrix kind
+        // and the genome list (one entry per output column) differ.
+        fs::create_dir_all(output)?;
+        let mut meta = IndexMeta::new(src.meta.config.clone());
+        meta.config.with_counts = !output_presence;
+        meta.genomes = specs.iter()
+            .map(|s| GenomeInfo::new(s.label.clone()))
+            .collect();
+        meta.write(output)?;
+
+        let n_src_genomes  = src.meta.genomes.len();
+        let n_partitions   = src.partition.n_partitions();
+
+        fs::create_dir_all(output.join(PARTITIONS_SUBDIR))?;
+        let dst_partition = KmerPartition::open_with_config(
+            output,
+            meta.config.kmer_size,
+            meta.config.minimizer_size,
+            meta.config.n_bits,
+        )?;
+
+        info!(
+            "select: {} partition(s), {} source genome(s) → {} output column(s)",
+            n_partitions, n_src_genomes, specs.len(),
+        );
+
+        let t   = Stage::start("select");
+        let pb  = progress_bar("select", n_partitions as u64, "partitions");
+        let src_partition = &src.partition;
+
+        let order: Vec<usize> = (0..n_partitions).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        let outcome = runner.run(
+            &order,
+            |i| dst_partition.select_partition(src_partition, i, specs, n_src_genomes, threshold, output_presence, false),
+            |_, _, _| { pb.inc(1); },
+        );
+
+        // Clear the progress bar on failure as well as on success, before
+        // the error is propagated to the caller.
+        pb.finish_and_clear();
+        outcome.map_err(OKIError::Partition)?;
+        rep.push(t.stop());
+
+        // Sentinel file marking the output as indexed.
+        fs::File::create(output.join(SENTINEL_INDEXED))?;
+
+        let idx = KmerIndex::open(output)?;
+        let t_pack = Stage::start("pack");
+        idx.pack_matrices()?;
+        rep.push(t_pack.stop());
+        Ok(idx)
+    }
+
+    /// Rewrite the genome columns of this index in-place according to `specs`.
+    ///
+    /// The MPHF and unitig files are unchanged; only data matrices are rewritten.
+    /// Metadata (`with_counts`, genome list) is updated and written only after
+    /// every partition has been processed successfully.
+    ///
+    /// # Errors
+    ///
+    /// - `OKIError::NotIndexed` if this index is not in the `Indexed` state.
+    /// - `OKIError::Partition` with the first error reported by a partition.
+    /// - Any error from opening the partition, writing metadata, or packing.
+    pub fn select_in_place(
+        &mut self,
+        specs: &[OutputCol],
+        threshold: u32,
+        output_presence: bool,
+        rep: &mut Reporter,
+    ) -> OKIResult<()> {
+        if self.state() != IndexState::Indexed {
+            return Err(OKIError::NotIndexed(self.root_path.clone()));
+        }
+
+        let n_src_genomes = self.meta.genomes.len();
+        let n_partitions  = self.partition.n_partitions();
+
+        // Second handle on the same on-disk partition, used as the source side.
+        let src_partition = KmerPartition::open_with_config(
+            &self.root_path,
+            self.meta.config.kmer_size,
+            self.meta.config.minimizer_size,
+            self.meta.config.n_bits,
+        )?;
+
+        info!(
+            "select (in-place): {} partition(s), {} source genome(s) → {} output column(s)",
+            n_partitions, n_src_genomes, specs.len(),
+        );
+
+        let t  = Stage::start("select");
+        let pb = progress_bar("select", n_partitions as u64, "partitions");
+
+        let partition = &self.partition;
+        let order: Vec<usize> = (0..n_partitions).collect();
+        let runner = crate::numa::PartitionRunner::new();
+        let outcome = runner.run(
+            &order,
+            |i| partition.select_partition(&src_partition, i, specs, n_src_genomes, threshold, output_presence, true),
+            |_, _, _| { pb.inc(1); },
+        );
+
+        // Clear the progress bar on failure as well as on success, before
+        // the error is propagated to the caller.
+        pb.finish_and_clear();
+        outcome.map_err(OKIError::Partition)?;
+        rep.push(t.stop());
+
+        self.meta.config.with_counts = !output_presence;
+        self.meta.genomes = specs.iter()
+            .map(|s| GenomeInfo::new(s.label.clone()))
+            .collect();
+        self.meta.write(&self.root_path)?;
+
+        let t_pack = Stage::start("pack");
+        self.pack_matrices()?;
+        rep.push(t_pack.stop());
+        Ok(())
+    }
+}
@@ -1,7 +1,8 @@
 use std::fs;
 use std::path::Path;

-use obicompactvec::LayerMeta;
+use obicompactvec::{LayerMeta, PersistentBitMatrix, PersistentCompactIntMatrix};
+use obicompactvec::traits::ColumnWeights;
 use obilayeredmap::meta::PartitionMeta;
 use rayon::prelude::*;

@@ -124,4 +125,68 @@ impl KmerIndex {
            total:             bpk(mphf_b + evidence_b + matrix_b),
        })
    }
+
+    /// Count distinct k-mers overall and per genome.
+    ///
+    /// Returns `(total_distinct_kmers, per_genome_kmer_counts)`, where the
+    /// per-genome figures come from each layer matrix's
+    /// `partial_kmer_counts()`, summed over all layers and partitions.
+    /// Partitions are scanned in parallel.
+    ///
+    /// Scanning is best effort: a missing index or layer directory, unreadable
+    /// metadata, or a matrix that fails to open is skipped silently, so this
+    /// function currently never returns `Err`.
+    pub fn genome_kmer_counts(&self) -> OKIResult<(usize, Vec<u64>)> {
+        let n_genomes = self.meta.genomes.len();
+
+        let per_partition: Vec<(usize, Vec<u64>)> = (0..self.n_partitions())
+            .into_par_iter()
+            .map(|part| {
+                let mut genome_hits = vec![0u64; n_genomes];
+                let mut kmers_seen  = 0usize;
+
+                let index_dir = self.partition.part_dir(part).join("index");
+                if !index_dir.exists() {
+                    return (0, genome_hits);
+                }
+
+                // Unreadable partition metadata is treated as "no layers".
+                let n_layers = match PartitionMeta::load(&index_dir) {
+                    Ok(meta) => meta.n_layers,
+                    Err(_)   => 0,
+                };
+
+                for l in 0..n_layers {
+                    let layer_dir = index_dir.join(format!("layer_{l}"));
+                    if !layer_dir.exists() {
+                        continue;
+                    }
+
+                    // The layer's k-mer count is added even if its matrix
+                    // later fails to open.
+                    if let Ok(meta) = LayerMeta::load(&layer_dir) {
+                        kmers_seen += meta.n;
+                    }
+
+                    // A count matrix is used only when `counts` exists and
+                    // `presence` does not; otherwise the bit matrix is opened.
+                    let counts_only = layer_dir.join("counts").exists()
+                        && !layer_dir.join("presence").exists();
+                    let mat: Box<dyn ColumnWeights> = if counts_only {
+                        let Ok(m) = PersistentCompactIntMatrix::open(&layer_dir) else { continue };
+                        Box::new(m)
+                    } else {
+                        let Ok(m) = PersistentBitMatrix::open(&layer_dir) else { continue };
+                        Box::new(m)
+                    };
+
+                    // Columns beyond `n_genomes` are ignored by the zip.
+                    let col_counts = mat.partial_kmer_counts();
+                    for (slot, &v) in genome_hits.iter_mut().zip(col_counts.iter()) {
+                        *slot += v;
+                    }
+                }
+
+                (kmers_seen, genome_hits)
+            })
+            .collect();
+
+        // Fold the per-partition results into the global totals.
+        let mut total_kmers  = 0usize;
+        let mut total_counts = vec![0u64; n_genomes];
+        for (part_kmers, part_counts) in per_partition {
+            total_kmers += part_kmers;
+            for (acc, v) in total_counts.iter_mut().zip(part_counts) {
+                *acc += v;
+            }
+        }
+
+        Ok((total_kmers, total_counts))
+    }
 }
@@ -1,6 +1,6 @@
 [package]
 name = "obikmer"
-version = "0.1.0"
+version = "1.1.13"
 edition = "2024"

 [[bin]]
@@ -19,6 +19,7 @@ obikpartitionner = { path = "../obikpartitionner" }
 obisys        = { path = "../obisys" }
 obiskio       = { path = "../obiskio" }
 obikindex     = { path = "../obikindex" }
+obitaxonomy   = { path = "../obitaxonomy" }
 obilayeredmap = { path = "../obilayeredmap" }
 clap          = { version = "4", features = ["derive"] }
 serde_json    = "1"
@@ -1,9 +1,9 @@
 use std::path::PathBuf;
-use std::sync::{Arc, Condvar, Mutex};

 use clap::Args;
 use obiread::NucPage;
 use obikseq::RoutableSuperKmer;
+use obipipeline::Throttled;

 // ── Shared arguments ──────────────────────────────────────────────────────────

@@ -103,54 +103,10 @@ impl CommonArgs {
    }
 }

-// ── Open-file throttling ──────────────────────────────────────────────────────
-
-struct FileSlots {
-    count:   Mutex<usize>,
-    condvar: Condvar,
-    max:     usize,
-}
-
-impl FileSlots {
-    fn new(max: usize) -> Self {
-        Self { count: Mutex::new(0), condvar: Condvar::new(), max }
-    }
-
-    fn acquire(&self) {
-        let mut count = self.count.lock().unwrap();
-        while *count >= self.max {
-            count = self.condvar.wait(count).unwrap();
-        }
-        *count += 1;
-    }
-
-    fn release(&self) {
-        let mut count = self.count.lock().unwrap();
-        *count -= 1;
-        self.condvar.notify_one();
-    }
-}
-
-struct SlotsGuard(Arc<FileSlots>);
-
-impl Drop for SlotsGuard {
-    fn drop(&mut self) {
-        self.0.release();
-    }
-}
-
 // ── Pipeline data carrier ─────────────────────────────────────────────────────

-/// A path bundled with an opaque guard token.
-/// The guard is acquired in the source thread and dropped by the flat worker
-/// once the file is fully read, releasing the open-file slot.
-pub struct PathWithSlot {
-    pub path:   PathBuf,
-    pub _guard: Box<dyn Send + 'static>,
-}
-
 pub enum PipelineData {
-    Path(PathWithSlot),
+    Path(Throttled<PathBuf>),
    NucPage(NucPage),
    Batch(Vec<RoutableSuperKmer>),
 }
@@ -158,20 +114,3 @@ pub enum PipelineData {
 unsafe impl Send for PipelineData {}
 unsafe impl Sync for PipelineData {}

-/// Wrap a path iterator so that at most `max_open` files are open simultaneously.
-/// Acquisition happens in the caller's thread (the pipeline source thread),
-/// never inside a worker, preventing deadlocks.
-pub fn throttle_paths(
-    source: impl Iterator<Item = PathBuf> + Send + 'static,
-    max_open: usize,
-) -> impl Iterator<Item = PathWithSlot> + Send + 'static {
-    let slots = Arc::new(FileSlots::new(max_open));
-    source.map(move |path| {
-        slots.acquire();
-        PathWithSlot {
-            path,
-            _guard: Box::new(SlotsGuard(Arc::clone(&slots))),
-        }
-    })
-}
-
@@ -3,6 +3,7 @@ use std::path::PathBuf;

 use clap::Args;
 use obikindex::KmerIndex;
+use obisys::progress_bar;
 use tracing::info;

 use super::predicate::FilterArgs;
@@ -20,6 +21,10 @@ pub struct DumpArgs {
    #[arg(long, default_value_t = false)]
    pub debug: bool,

+    /// Only output the first N kmers
+    #[arg(long)]
+    pub head: Option<usize>,
+
    #[command(flatten)]
    pub filter: FilterArgs,
 }
@@ -37,12 +42,14 @@ pub fn run(args: DumpArgs) {
    );

    let filters = args.filter.build_filters(&idx.meta().genomes);
+    let pb = progress_bar("dump", idx.n_partitions() as u64, "partitions");

    let stdout = io::stdout();
    let mut out = BufWriter::new(stdout.lock());

-    idx.dump(&mut out, args.force_presence, args.debug, &filters).unwrap_or_else(|e| {
+    idx.dump(&mut out, args.force_presence, args.debug, args.head, &filters, || pb.inc(1)).unwrap_or_else(|e| {
        eprintln!("dump error: {e}");
        std::process::exit(1);
    });
+    pb.finish_and_clear();
 }
@@ -6,10 +6,10 @@ use obikpartitionner::filter::{MaxTotalCount, MinTotalCount};
 use obisys::Reporter;
 use tracing::info;

-use super::predicate::FilterArgs;
+use super::predicate::FilterArgs as KmerFilterArgs;

 #[derive(Args)]
-pub struct RebuildArgs {
+pub struct FilterArgs {
    /// Source index directory
    pub source: PathBuf,

@@ -18,7 +18,7 @@ pub struct RebuildArgs {
    pub output: PathBuf,

    #[command(flatten)]
-    pub filter: FilterArgs,
+    pub filter: KmerFilterArgs,

    /// Minimum total count across all genomes (count index only)
    #[arg(long)]
@@ -37,7 +37,7 @@ pub struct RebuildArgs {
    pub force: bool,
 }

-pub fn run(args: RebuildArgs) {
+pub fn run(args: FilterArgs) {
    let src = KmerIndex::open(&args.source).unwrap_or_else(|e| {
        eprintln!("error opening source index: {e}");
        std::process::exit(1);
@@ -50,7 +50,7 @@ pub fn run(args: RebuildArgs) {
    };

    info!(
-        "rebuild: {} genome(s), mode={:?}, source={}",
+        "filter: {} genome(s), mode={:?}, source={}",
        &src.meta().genomes.len(), mode, args.source.display()
    );

@@ -66,10 +66,10 @@ pub fn run(args: RebuildArgs) {
    let mut rep = Reporter::new();
    KmerIndex::rebuild(&args.output, &src, &filters, mode, args.force, &mut rep)
        .unwrap_or_else(|e| {
-            eprintln!("error rebuilding index: {e}");
+            eprintln!("error filtering index: {e}");
            std::process::exit(1);
        });

    rep.print();
-    info!("rebuilt index → {}", args.output.display());
+    info!("filtered index → {}", args.output.display());
 }
@@ -26,6 +26,11 @@ pub struct MergeArgs {
    /// Disambiguate duplicate genome labels by appending .1, .2, … instead of erroring
    #[arg(long, default_value_t = false)]
    pub rename_duplicates: bool,
+
+    /// Fraction of available RAM reserved as memory budget for parallel partition merging.
+    /// Reduce if OOM occurs despite the adaptive scheduler (e.g. --budget-fraction 0.3).
+    #[arg(long, default_value_t = 0.5)]
+    pub budget_fraction: f64,
 }

 pub fn run(args: MergeArgs) {
@@ -60,7 +65,7 @@ pub fn run(args: MergeArgs) {
    );

    let mut rep = Reporter::new();
-    KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates, &mut rep).unwrap_or_else(|e| {
+    KmerIndex::merge(&args.output, &source_refs, mode, args.force, args.rename_duplicates, args.budget_fraction, &mut rep).unwrap_or_else(|e| {
        eprintln!("error merging: {e}");
        std::process::exit(1);
    });
@@ -1,6 +1,8 @@
 pub mod annotate;
+pub mod filter;
 pub mod pack;
 pub(crate) mod predicate;
+pub mod select;
 pub mod utils;
 pub mod distance;
 pub mod dump;
@@ -8,7 +10,6 @@ pub mod estimate;
 pub mod index;
 pub mod merge;
 pub mod query;
-pub mod rebuild;
 pub mod reindex;
 pub mod superkmer;
 pub mod unitig;
@@ -3,6 +3,7 @@ use std::collections::HashMap;
 use clap::Args;
 use obikindex::GenomeInfo;
 use obikpartitionner::{GroupQuorumFilter, KmerFilter};
+use obitaxonomy::{TaxPath, TaxPattern};

 // ── Operator ──────────────────────────────────────────────────────────────────

@@ -49,7 +50,6 @@ impl MetaPred {
        if values.iter().any(|v| v.is_empty()) {
            return Err(format!("empty value in predicate: {s}"));
        }
-
        Ok(Self { key, op, values })
    }

@@ -70,18 +70,15 @@ impl MetaPred {

 // ── Path matching ─────────────────────────────────────────────────────────────

-/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
+/// True if the stored taxonomy `value` matches `pattern`.
 ///
-/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
-/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
+/// `value` must be a valid `TaxPath` (starts with `taxonomy:/`).
+/// `pattern` is a `TaxPattern` query (see `obitaxonomy::TaxPattern` for syntax).
+/// Returns `false` if either fails to parse.
 fn path_matches(value: &str, pattern: &str) -> bool {
-    if pattern.starts_with('/') {
-        value == pattern
-            || (value.starts_with(pattern)
-                && value[pattern.len()..].starts_with('/'))
-    } else {
-        value.split('/').any(|seg| seg == pattern)
-    }
+    let Ok(path) = TaxPath::parse(value)    else { return false };
+    let Ok(pat)  = TaxPattern::parse(pattern) else { return false };
+    pat.matches(&path)
 }

 // ── Three-value group evaluation ──────────────────────────────────────────────
@@ -162,6 +159,7 @@ pub struct FilterArgs {
    pub max_count: Option<usize>,

    /// Minimum fraction of ingroup genomes containing the k-mer [0.0–1.0]
+    /// (default 1.0 when --ingroup is set, 0.0 otherwise)
    #[arg(long)]
    pub min_frac: Option<f64>,

@@ -174,6 +172,7 @@ pub struct FilterArgs {
    pub min_outgroup_count: Option<usize>,

    /// Maximum number of outgroup genomes containing the k-mer
+    /// (default 0 when --outgroup is set, no constraint otherwise)
    #[arg(long)]
    pub max_outgroup_count: Option<usize>,

@@ -205,7 +204,7 @@ impl FilterArgs {
                std::process::exit(1);
            }))
            .collect();
-        vec![Box::new(build_group_filter(
+        let filter = build_group_filter(
            genomes,
            &ingroup_preds,
            &outgroup_preds,
@@ -220,10 +219,24 @@ impl FilterArgs {
                min_outgroup_frac:  self.min_outgroup_frac,
                max_outgroup_frac:  self.max_outgroup_frac,
            },
-        ))]
+        ).unwrap_or_else(|e| {
+            eprintln!("error in filter parameters: {e}");
+            std::process::exit(1);
+        });
+        vec![Box::new(filter)]
    }
 }

+/// Indices of the genomes whose metadata satisfies the single predicate `pred_str`.
+///
+/// Returns the parse error message if `pred_str` is not a valid predicate.
+/// A genome is kept only when the predicate evaluates to `Some(true)`.
+pub fn matching_genome_indices(pred_str: &str, genomes: &[GenomeInfo]) -> Result<Vec<usize>, String> {
+    let pred = MetaPred::parse(pred_str)?;
+    let mut hits = Vec::new();
+    for (idx, genome) in genomes.iter().enumerate() {
+        if pred.eval(&genome.meta) == Some(true) {
+            hits.push(idx);
+        }
+    }
+    Ok(hits)
+}
+
 pub struct GroupFilterParams {
    pub threshold:          u32,
    pub min_count:          Option<usize>,
@@ -241,7 +254,7 @@ pub fn build_group_filter(
    ingroup_preds:  &[MetaPred],
    outgroup_preds: &[MetaPred],
    p:              GroupFilterParams,
-) -> GroupQuorumFilter {
+) -> Result<GroupQuorumFilter, String> {
    let (ingroup_idx, outgroup_idx) = if ingroup_preds.is_empty() && outgroup_preds.is_empty() {
        ((0..genomes.len()).collect(), vec![])
    } else {
@@ -258,17 +271,52 @@ pub fn build_group_filter(
    let in_size  = ingroup_idx.len();
    let out_size = outgroup_idx.len();

-    GroupQuorumFilter {
+    let ingroup_quorum_explicit  = p.min_count.is_some() || p.max_count.is_some()
+        || p.min_frac.is_some() || p.max_frac.is_some();
+    let outgroup_quorum_explicit = p.min_outgroup_count.is_some() || p.max_outgroup_count.is_some()
+        || p.min_outgroup_frac.is_some() || p.max_outgroup_frac.is_some();
+
+    let default_min_frac           = if !ingroup_preds.is_empty()  && !ingroup_quorum_explicit  { 1.0      } else { 0.0      };
+    let default_max_outgroup_count = if !outgroup_preds.is_empty() && !outgroup_quorum_explicit { 0        } else { out_size };
+
+    let min_count          = p.min_count.unwrap_or(0);
+    let max_count          = p.max_count.unwrap_or(in_size);
+    let min_frac           = p.min_frac.unwrap_or(default_min_frac);
+    let max_frac           = p.max_frac.unwrap_or(1.0);
+    let min_outgroup_count = p.min_outgroup_count.unwrap_or(0);
+    let max_outgroup_count = p.max_outgroup_count.unwrap_or(default_max_outgroup_count);
+    let min_outgroup_frac  = p.min_outgroup_frac.unwrap_or(0.0);
+    let max_outgroup_frac  = p.max_outgroup_frac.unwrap_or(1.0);
+
+    for (v, lo, hi) in [
+        ("--min-frac/--max-frac",                        min_frac,           max_frac),
+        ("--min-outgroup-frac/--max-outgroup-frac",       min_outgroup_frac,  max_outgroup_frac),
+    ] {
+        if !(0.0..=1.0).contains(&lo) || !(0.0..=1.0).contains(&hi) {
+            return Err(format!("{v}: fraction values must be in [0.0, 1.0]"));
+        }
+        if lo > hi {
+            return Err(format!("{v}: min ({lo}) is greater than max ({hi})"));
+        }
+    }
+    if min_count > max_count {
+        return Err(format!("--min-count/--max-count: min ({min_count}) is greater than max ({max_count})"));
+    }
+    if min_outgroup_count > max_outgroup_count {
+        return Err(format!("--min-outgroup-count/--max-outgroup-count: min ({min_outgroup_count}) is greater than max ({max_outgroup_count})"));
+    }
+
+    Ok(GroupQuorumFilter {
        ingroup_idx,
        outgroup_idx,
-        threshold:          p.threshold,
-        min_count:          p.min_count.unwrap_or(0),
-        max_count:          p.max_count.unwrap_or(in_size),
-        min_frac:           p.min_frac.unwrap_or(0.0),
-        max_frac:           p.max_frac.unwrap_or(1.0),
-        min_outgroup_count: p.min_outgroup_count.unwrap_or(0),
-        max_outgroup_count: p.max_outgroup_count.unwrap_or(out_size),
-        min_outgroup_frac:  p.min_outgroup_frac.unwrap_or(0.0),
-        max_outgroup_frac:  p.max_outgroup_frac.unwrap_or(1.0),
-    }
+        threshold: p.threshold,
+        min_count,
+        max_count,
+        min_frac,
+        max_frac,
+        min_outgroup_count,
+        max_outgroup_count,
+        min_outgroup_frac,
+        max_outgroup_frac,
+    })
 }
@@ -0,0 +1,253 @@
+use std::collections::{BTreeMap, HashMap};
+use std::path::PathBuf;
+
+use clap::{Args, ValueEnum};
+use obikindex::{GenomeInfo, KmerIndex};
+use obikpartitionner::{AggOp, OutputCol};
+use obisys::Reporter;
+use tracing::info;
+
+use super::predicate::matching_genome_indices;
+
+// ── CLI types ─────────────────────────────────────────────────────────────────
+
+// Command-line spelling of an aggregation operator. Deriving `ValueEnum`
+// lets clap parse it directly as an option value (it is the type of
+// `--aggregate-op`); it is converted to the library-level `AggOp` through
+// `From<AggOpArg> for AggOp`.
+// Plain `//` comments are used on purpose: `///` comments on the variants
+// would be picked up by clap as help text.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)]
+pub enum AggOpArg {
+    Any,
+    All,
+    None,
+    Sum,
+    Min,
+    Max,
+}
+
+/// Maps each command-line operator onto the `AggOp` variant of the same name.
+impl From<AggOpArg> for AggOp {
+    fn from(arg: AggOpArg) -> Self {
+        match arg {
+            AggOpArg::Sum => Self::Sum,
+            AggOpArg::Min => Self::Min,
+            AggOpArg::Max => Self::Max,
+            AggOpArg::Any => Self::Any,
+            AggOpArg::All => Self::All,
+            AggOpArg::None => Self::None,
+        }
+    }
+}
+
+// Arguments of the `select` subcommand.
+//
+// Constraints declared here through clap: `--output` conflicts with
+// `--in-place`, and `--group` conflicts with `--aggregate-by` (declared on
+// both options). `run` additionally requires that one of `--output` /
+// `--in-place` is present.
+//
+// NOTE: the `///` comments on the fields below are rendered by clap as the
+// user-visible help text, so editing them changes the CLI output.
+#[derive(Args)]
+pub struct SelectArgs {
+    /// Source index directory
+    pub source: PathBuf,
+
+    /// Output index directory (mutually exclusive with --in-place)
+    #[arg(long, conflicts_with = "in_place")]
+    pub output: Option<PathBuf>,
+
+    /// Rewrite the source index in-place (mutually exclusive with --output)
+    #[arg(long)]
+    pub in_place: bool,
+
+    /// Define a named group: `<name>:<pred>` (repeatable; mutually exclusive with --aggregate-by)
+    #[arg(long, value_name = "NAME:PRED", conflicts_with = "aggregate_by")]
+    pub group: Vec<String>,
+
+    /// Per-group aggregation operator: `<name>:<op>` (repeatable)
+    #[arg(long, value_name = "NAME:OP")]
+    pub group_op: Vec<String>,
+
+    /// Auto-create one group per unique value of metadata key <KEY>
+    #[arg(long, value_name = "KEY", conflicts_with = "group")]
+    pub aggregate_by: Option<String>,
+
+    /// Aggregation operator for all auto-generated groups
+    #[arg(long, value_name = "OP")]
+    pub aggregate_op: Option<AggOpArg>,
+
+    /// Output columns in order: group names or genome labels, comma-separated
+    #[arg(long, value_name = "COL,...", value_delimiter = ',')]
+    pub select: Option<Vec<String>>,
+
+    /// Minimum count to consider a genome as "carrying" the k-mer (logical ops only)
+    #[arg(long, default_value = "0")]
+    pub presence_threshold: u32,
+
+    /// Overwrite existing output directory
+    #[arg(short, long)]
+    pub force: bool,
+}
+
+// ── Helpers ───────────────────────────────────────────────────────────────────
+
+/// Splits a `<name>:<value>` argument at its first `:`.
+///
+/// The name is trimmed of surrounding whitespace; the value is returned
+/// exactly as written. When `s` contains no `:`, an error mentioning `flag`
+/// is printed to stderr and the process exits with status 1.
+fn parse_name_value(s: &str, flag: &str) -> (String, String) {
+    if let Some((name, value)) = s.split_once(':') {
+        return (name.trim().to_owned(), value.to_owned());
+    }
+    eprintln!("error in {flag}: expected <name>:<value>, got: {s}");
+    std::process::exit(1)
+}
+
+/// Translates an operator name (matched case-insensitively) into an `AggOp`.
+///
+/// Prints the list of valid names to stderr and exits with status 1 when the
+/// name is not recognised.
+fn parse_agg_op(s: &str) -> AggOp {
+    let wanted = s.to_lowercase();
+    let table = [
+        ("any", AggOp::Any),
+        ("all", AggOp::All),
+        ("none", AggOp::None),
+        ("sum", AggOp::Sum),
+        ("min", AggOp::Min),
+        ("max", AggOp::Max),
+    ];
+    for (name, op) in table {
+        if name == wanted {
+            return op;
+        }
+    }
+    eprintln!("unknown aggregation operator: {other}; valid: any, all, none, sum, min, max", other = wanted);
+    std::process::exit(1)
+}
+
+/// Operator used when none is given explicitly: `Sum` when the source index
+/// holds counts, `Any` otherwise.
+fn default_op(src_is_count: bool) -> AggOp {
+    match src_is_count {
+        true => AggOp::Sum,
+        false => AggOp::Any,
+    }
+}
+
+// ── build_specs ───────────────────────────────────────────────────────────────
+
+/// Resolve CLI arguments into an ordered list of `OutputCol`.
+///
+/// * `args` – parsed `select` options (`--group`, `--group-op`,
+///   `--aggregate-by`, `--aggregate-op`, `--select`).
+/// * `genomes` – genomes of the source index; positions in this slice are the
+///   indices stored in each `OutputCol`.
+/// * `src_is_count` – whether the source index stores counts; it selects the
+///   default operator (`default_op`) and influences the output type.
+///
+/// Returns `(specs, output_presence)`. `output_presence` is `true` when the
+/// source is not a count index, or when every resulting column uses a logical
+/// operator.
+///
+/// Any invalid argument (malformed `<name>:<value>`, unparsable predicate,
+/// unknown operator, `--group-op` naming an undefined group, unknown
+/// `--select` column, or no output column at all) is reported on stderr and
+/// terminates the process with status 1.
+fn build_specs(
+    args: &SelectArgs,
+    genomes: &[GenomeInfo],
+    src_is_count: bool,
+) -> (Vec<OutputCol>, bool) {
+    // ── 1. Build group_indices: name → Vec<usize> ────────────────────────────
+    // `group_order` remembers the order in which groups were defined; it is the
+    // default column order when `--select` is absent (step 4).
+    let mut group_order: Vec<String> = Vec::new();
+    let mut group_indices: HashMap<String, Vec<usize>> = HashMap::new();
+
+    if let Some(ref key) = args.aggregate_by {
+        // One group per unique value of `key`, in sorted order.
+        // Genomes whose metadata lacks `key` end up in no group.
+        let mut value_to_indices: BTreeMap<String, Vec<usize>> = BTreeMap::new();
+        for (i, g) in genomes.iter().enumerate() {
+            if let Some(v) = g.meta.get(key) {
+                value_to_indices.entry(v.clone()).or_default().push(i);
+            }
+        }
+        for (v, idxs) in value_to_indices {
+            group_order.push(v.clone());
+            group_indices.insert(v, idxs);
+        }
+    } else {
+        for raw in &args.group {
+            let (name, pred) = parse_name_value(raw, "--group");
+            let idxs = matching_genome_indices(&pred, genomes).unwrap_or_else(|e| {
+                eprintln!("error in --group {name}: {e}");
+                std::process::exit(1);
+            });
+            // A repeated name keeps its first position in `group_order`, but its
+            // genome indices are replaced by the latest definition.
+            if !group_indices.contains_key(&name) {
+                group_order.push(name.clone());
+            }
+            group_indices.insert(name, idxs);
+        }
+    }
+
+    // ── 2. Build per-group ops ────────────────────────────────────────────────
+    // `global_op` (from --aggregate-op) is the fallback for groups that have no
+    // entry in `group_op`.
+    let global_op = args.aggregate_op.map(AggOp::from);
+    let mut group_op: HashMap<String, AggOp> = HashMap::new();
+    for raw in &args.group_op {
+        let (name, op_str) = parse_name_value(raw, "--group-op");
+        if !group_indices.contains_key(&name) {
+            eprintln!("--group-op references undefined group: {name}");
+            std::process::exit(1);
+        }
+        group_op.insert(name, parse_agg_op(&op_str));
+    }
+
+    // ── 3. Genome label → index map for pass-through columns ─────────────────
+    let label_to_idx: HashMap<&str, usize> = genomes.iter().enumerate()
+        .map(|(i, g)| (g.label.as_str(), i))
+        .collect();
+
+    // ── 4. Determine output column names ─────────────────────────────────────
+    // Priority: explicit --select, then the defined groups, then every genome.
+    let col_names: Vec<String> = if let Some(ref sel) = args.select {
+        sel.clone()
+    } else if !group_order.is_empty() {
+        group_order.clone()
+    } else {
+        // Identity: all genomes in original order
+        genomes.iter().map(|g| g.label.clone()).collect()
+    };
+
+    // ── 5. Build OutputCol list ───────────────────────────────────────────────
+    let mut specs: Vec<OutputCol> = Vec::with_capacity(col_names.len());
+
+    for name in &col_names {
+        // A group name takes precedence over a genome label of the same name.
+        if let Some(idxs) = group_indices.get(name) {
+            // Operator precedence: per-group op, then global op, then default.
+            let op = group_op.get(name)
+                .copied()
+                .or(global_op)
+                .unwrap_or_else(|| default_op(src_is_count));
+            specs.push(OutputCol { label: name.clone(), indices: idxs.clone(), op });
+        } else if let Some(&idx) = label_to_idx.get(name.as_str()) {
+            // Pass-through: single-element group with default op.
+            let op = default_op(src_is_count);
+            specs.push(OutputCol { label: name.clone(), indices: vec![idx], op });
+        } else {
+            eprintln!("--select: unknown column '{name}' (not a group name or genome label)");
+            std::process::exit(1);
+        }
+    }
+
+    if specs.is_empty() {
+        eprintln!("select: no output columns defined");
+        std::process::exit(1);
+    }
+
+    // ── 6. Determine output type ──────────────────────────────────────────────
+    let output_presence = !src_is_count
+        || specs.iter().all(|s| s.op.is_logical());
+
+    (specs, output_presence)
+}
+
+// ── run ───────────────────────────────────────────────────────────────────────
+
+/// Entry point of the `select` subcommand.
+///
+/// Opens the source index, resolves the requested output columns, then either
+/// rewrites the index in place (`--in-place`) or writes the result to the
+/// `--output` directory. Every failure is printed to stderr and ends the
+/// process with status 1.
+pub fn run(args: SelectArgs) {
+    // Without --in-place, a destination directory is mandatory.
+    if !(args.in_place || args.output.is_some()) {
+        eprintln!("error: one of --output or --in-place must be specified");
+        std::process::exit(1);
+    }
+
+    let mut src = match KmerIndex::open(&args.source) {
+        Ok(index) => index,
+        Err(e) => {
+            eprintln!("error opening source index: {e}");
+            std::process::exit(1);
+        }
+    };
+
+    let src_is_count = src.meta().config.with_counts;
+    let genomes = src.meta().genomes.clone();
+    let (specs, output_presence) = build_specs(&args, &genomes, src_is_count);
+
+    let output_kind = if output_presence { "presence" } else { "count" };
+    info!(
+        "select: {} genome(s) → {} output column(s), output={}",
+        src.meta().genomes.len(),
+        specs.len(),
+        output_kind,
+    );
+
+    let mut rep = Reporter::new();
+
+    if !args.in_place {
+        // Presence of `output` was checked at the top of the function.
+        let output = args.output.unwrap();
+        if let Err(e) = KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force, &mut rep) {
+            eprintln!("select error: {e}");
+            std::process::exit(1);
+        }
+        rep.print();
+        info!("selected index → {}", output.display());
+        return;
+    }
+
+    if let Err(e) = src.select_in_place(&specs, args.presence_threshold, output_presence, &mut rep) {
+        eprintln!("select error: {e}");
+        std::process::exit(1);
+    }
+    rep.print();
+    info!("selected in-place → {}", args.source.display());
+}
@@ -1,10 +1,13 @@
 use std::io::{self, BufWriter, Write};
+use std::path::PathBuf;

 use clap::Args;
 use obifastwrite::write_scatter;
 use obikseq::{RoutableSuperKmer, set_k, set_m};

-use crate::cli::{CommonArgs, PipelineData, PathWithSlot, partitions_to_bits, throttle_paths};
+use obipipeline::{Throttled, throttle};
+
+use crate::cli::{CommonArgs, PipelineData, partitions_to_bits};

 #[derive(Args)]
 pub struct SuperkmerArgs {
@@ -46,14 +49,15 @@ pub fn run(args: SuperkmerArgs) {
    set_k(k);
    set_m(m);

-    let path_source = throttle_paths(args.common.seqfile_paths(), max_open);
+    let path_source = throttle(args.common.seqfile_paths(), max_open);

    let pipe = obipipeline::make_pipe! {
-        PipelineData : PathWithSlot => Vec<RoutableSuperKmer>,
+        PipelineData : Throttled<PathBuf> => Vec<RoutableSuperKmer>,
        ||? {
            let k = k;
-            move |pw: PathWithSlot| {
-                let path_str = pw.path.to_str().unwrap_or("").to_owned();
+            move |pw: Throttled<PathBuf>| {
+                let path_str = pw.item.to_str().unwrap_or("").to_owned();
+                let _guard = pw.guard;
                obiread::open_nuc_stream(&path_str, k)
            }
        } : Path => NucPage,
@@ -48,6 +48,7 @@ pub fn run(args: UnitigArgs) {
            partition
                .iter_partition_kmers(i, use_counts, n_genomes, &filters, |kmer, _row| {
                    local_g.push(kmer);
+                    true
                })
                .unwrap_or_else(|e| {
                    eprintln!("error reading partition {i}: {e}");
@@ -1,3 +1,4 @@
+use std::io::{self, Write};
 use std::path::PathBuf;

 use clap::Args;
@@ -6,20 +7,33 @@ use tracing::info;

 #[derive(Args)]
 pub struct UtilsArgs {
-    /// Index directory to operate on
-    pub index: PathBuf,
+    /// Index directories to operate on (one or more)
+    #[arg(required = true, num_args = 1..)]
+    pub indexes: Vec<PathBuf>,

-    /// Set a new genome label: NEW_LABEL=OLD_LABEL
+    /// Set a new genome label: NEW_LABEL=OLD_LABEL  (single-index only)
    #[arg(long, value_name = "NEW=OLD")]
    pub new_label: Option<String>,

-    /// Add missing layer_meta.json files to each layer (required after upgrading from old indexes)
+    /// Add missing layer_meta.json files to each layer  (single-index only)
    #[arg(long)]
    pub upgrade_index: bool,

-    /// Print bits-per-kmer statistics (MPHF, evidence, matrix, total)
+    /// Print bits-per-kmer statistics  (single-index only)
    #[arg(long)]
    pub bits_per_kmer: bool,
+
+    /// Print per-genome k-mer counts as CSV  (single-index only)
+    #[arg(long)]
+    pub stats: bool,
+
+    /// Print partition size distribution report (accepts multiple indexes)
+    #[arg(long)]
+    pub partition_stats: bool,
+
+    /// Write per-(partition, source) raw data as CSV to FILE  (used with --partition-stats)
+    #[arg(long, value_name = "FILE")]
+    pub csv: Option<PathBuf>,
 }

 pub fn run(args: UtilsArgs) {
@@ -27,25 +41,266 @@ pub fn run(args: UtilsArgs) {

    if let Some(spec) = &args.new_label {
        any = true;
-        run_rename(&args.index, spec);
+        run_rename(single_index(&args), spec);
    }

    if args.upgrade_index {
        any = true;
-        run_upgrade_index(&args.index);
+        run_upgrade_index(single_index(&args));
    }

    if args.bits_per_kmer {
        any = true;
-        run_bits_per_kmer(&args.index);
+        run_bits_per_kmer(single_index(&args));
+    }
+
+    if args.stats {
+        any = true;
+        run_stats(single_index(&args));
+    }
+
+    if args.partition_stats {
+        any = true;
+        run_partition_stats(&args.indexes, args.csv.as_deref());
    }

    if !any {
-        eprintln!("utils: no operation specified. Available options: --new-label NEW=OLD, --upgrade-index, --bits-per-kmer");
+        eprintln!(
+            "utils: no operation specified. \
+             Available: --new-label, --upgrade-index, --bits-per-kmer, --stats, --partition-stats"
+        );
        std::process::exit(1);
    }
 }

+// ── helpers ───────────────────────────────────────────────────────────────────
+
+/// Returns the one index path given on the command line.
+///
+/// Used by the operations that work on a single index: when more than one
+/// path was supplied, an error is printed to stderr and the process exits
+/// with status 1.
+fn single_index(args: &UtilsArgs) -> &PathBuf {
+    match args.indexes.len() {
+        given if given > 1 => {
+            eprintln!("utils: this option requires exactly one index (got {})", given);
+            std::process::exit(1)
+        }
+        _ => &args.indexes[0],
+    }
+}
+
+// ── --partition-stats ─────────────────────────────────────────────────────────
+
+/// Per-partition, per-source byte count of all unitigs.bin files summed across layers.
+struct PartRow {
+    /// Partition number within the source index.
+    partition: usize,
+    /// Name identifying the source index the row was measured on.
+    source: String,
+    /// Total size in bytes of the partition's unitigs files over all layers.
+    bytes: u64,
+}
+
+/// Builds one `PartRow` per (index, partition) pair.
+///
+/// For every index in `indexes` and every partition of that index, the sizes
+/// of the per-layer unitigs files are added up, walking layers from 0 until
+/// the first layer whose file does not exist. The row's `source` is the last
+/// component of the index path (or the whole path when it has none).
+///
+/// An index that cannot be opened is reported on stderr and ends the process
+/// with status 1; a file whose metadata cannot be read contributes 0 bytes.
+fn collect_rows(indexes: &[PathBuf]) -> Vec<PartRow> {
+    let mut rows = Vec::new();
+    for path in indexes {
+        let idx = match KmerIndex::open(path) {
+            Ok(opened) => opened,
+            Err(e) => {
+                eprintln!("error opening index {}: {e}", path.display());
+                std::process::exit(1);
+            }
+        };
+        let name = match path.file_name() {
+            Some(n) => n.to_string_lossy().into_owned(),
+            _ => path.display().to_string(),
+        };
+        for partition in 0..idx.n_partitions() {
+            let bytes: u64 = (0..)
+                .map(|layer| idx.layer_unitigs_path(partition, layer))
+                .take_while(|p| p.exists())
+                .filter_map(|p| std::fs::metadata(&p).ok())
+                .map(|m| m.len())
+                .sum();
+            rows.push(PartRow { partition, source: name.clone(), bytes });
+        }
+    }
+    rows
+}
+
+/// Adds up the byte counts of `rows` per partition, over all sources.
+///
+/// Returns a vector of length `n_parts` indexed by partition number.
+/// Panics if a row refers to a partition `>= n_parts`.
+fn partition_totals(rows: &[PartRow], n_parts: usize) -> Vec<u64> {
+    rows.iter().fold(vec![0u64; n_parts], |mut totals, row| {
+        totals[row.partition] += row.bytes;
+        totals
+    })
+}
+
+/// Summary statistics of the per-partition totals.
+///
+/// Returns `(min, max, mean, median, p95, p99, std_dev)` where:
+/// * `median` averages the two middle values when the length is even;
+/// * `p95` / `p99` are the sorted values at index `floor(n × 0.95)` and
+///   `floor(n × 0.99)`;
+/// * `std_dev` is the population standard deviation, truncated to an integer.
+///
+/// An empty slice yields all zeros instead of panicking.
+fn stats_summary(totals: &[u64]) -> (u64, u64, f64, f64, u64, u64, u64) {
+    if totals.is_empty() {
+        return (0, 0, 0.0, 0.0, 0, 0, 0);
+    }
+
+    let mut sorted = totals.to_vec();
+    sorted.sort_unstable();
+    let n = sorted.len();
+    let min = sorted[0];
+    let max = sorted[n - 1];
+    let mean = sorted.iter().sum::<u64>() as f64 / n as f64;
+    let median = if n % 2 == 0 {
+        // Add in f64 so the sum of the two middle values cannot overflow u64.
+        (sorted[n / 2 - 1] as f64 + sorted[n / 2] as f64) / 2.0
+    } else {
+        sorted[n / 2] as f64
+    };
+    // `n × 0.95` and `n × 0.99` are strictly below `n`, so both indices are valid.
+    let p95 = sorted[(n as f64 * 0.95) as usize];
+    let p99 = sorted[(n as f64 * 0.99) as usize];
+    let variance = sorted
+        .iter()
+        .map(|&v| (v as f64 - mean).powi(2))
+        .sum::<f64>()
+        / n as f64;
+    let std_dev = variance.sqrt();
+    (min, max, mean, median, p95, p99, std_dev as u64)
+}
+
+/// Formats a byte count with a binary-scaled unit.
+///
+/// Values of at least 2^30, 2^20 or 2^10 bytes are shown with one decimal as
+/// "GB", "MB" or "KB" respectively; smaller values are shown as whole bytes.
+fn human_bytes(b: u64) -> String {
+    const KB: u64 = 1 << 10;
+    const MB: u64 = 1 << 20;
+    const GB: u64 = 1 << 30;
+    match b {
+        n if n >= GB => format!("{:.1} GB", n as f64 / GB as f64),
+        n if n >= MB => format!("{:.1} MB", n as f64 / MB as f64),
+        n if n >= KB => format!("{:.1} KB", n as f64 / KB as f64),
+        _ => format!("{b} B"),
+    }
+}
+
+/// Renders `totals` as a text histogram with `n_buckets` equal-width buckets.
+///
+/// Each output line shows the bucket's byte range, a bar of at most
+/// `bar_width` block characters scaled to the fullest bucket, and the number
+/// of values in the bucket. When all values are equal a single explanatory
+/// line is returned instead. Panics if `totals` is empty.
+fn ascii_histogram(totals: &[u64], n_buckets: usize, bar_width: usize) -> String {
+    let lowest = *totals.iter().min().unwrap();
+    let highest = *totals.iter().max().unwrap();
+    if lowest == highest {
+        return format!("  (all partitions identical: {})\n", human_bytes(lowest));
+    }
+
+    let step = (highest - lowest).max(1) as f64 / n_buckets as f64;
+    let mut tally = vec![0usize; n_buckets];
+    totals.iter().for_each(|&value| {
+        // The maximum value would land one past the end; clamp it into the last bucket.
+        let slot = (((value - lowest) as f64 / step) as usize).min(n_buckets - 1);
+        tally[slot] += 1;
+    });
+
+    let tallest = *tally.iter().max().unwrap();
+    tally
+        .iter()
+        .enumerate()
+        .map(|(i, &count)| {
+            let lo = lowest + (i as f64 * step) as u64;
+            let hi = lowest + ((i + 1) as f64 * step) as u64;
+            let bar_len = if tallest > 0 { count * bar_width / tallest } else { 0 };
+            format!(
+                "  {:>8} – {:>8} │{:<width$} {}\n",
+                human_bytes(lo),
+                human_bytes(hi),
+                "█".repeat(bar_len),
+                count,
+                width = bar_width
+            )
+        })
+        .collect()
+}
+
+/// Prints a Markdown report on partition sizes for one or more indexes.
+///
+/// The report (written to stdout) contains summary statistics of the total
+/// unitigs bytes per partition summed across `indexes`, an ASCII histogram,
+/// and the partitions whose total exceeds `Q3 + 1.5 × IQR`.
+/// When `csv_path` is given, the raw `(partition, source, bytes)` rows are
+/// also written there as CSV.
+///
+/// Exits with status 1 when no data is found or when the CSV file cannot be
+/// created or written.
+fn run_partition_stats(indexes: &[PathBuf], csv_path: Option<&std::path::Path>) {
+    let rows = collect_rows(indexes);
+    if rows.is_empty() {
+        eprintln!("partition-stats: no data found");
+        std::process::exit(1);
+    }
+
+    let n_parts = rows.iter().map(|r| r.partition).max().unwrap() + 1;
+    let totals = partition_totals(&rows, n_parts);
+    let (min, max, mean, median, p95, p99, std_dev) = stats_summary(&totals);
+
+    // Outliers: partitions whose total is above Q3 + 1.5 × IQR.
+    let mut sorted_t = totals.clone();
+    sorted_t.sort_unstable();
+    let q1 = sorted_t[n_parts / 4] as f64;
+    let q3 = sorted_t[3 * n_parts / 4] as f64;
+    let iqr = q3 - q1;
+    let outlier_threshold = q3 + 1.5 * iqr;
+
+    // A zero median would make the ratio NaN or infinite; report "n/a" instead.
+    let ratio_to_median = |v: u64| -> String {
+        if median > 0.0 {
+            format!("{:.2}×", v as f64 / median)
+        } else {
+            "n/a".to_string()
+        }
+    };
+
+    let mut out = String::new();
+    out.push_str("# Partition size report\n\n");
+    out.push_str(&format!(
+        "Sources: {}  \nPartitions: {}  \n\n",
+        indexes.len(),
+        n_parts
+    ));
+
+    out.push_str("## Summary statistics (total unitigs.bin bytes per partition, sum across sources)\n\n");
+    out.push_str("| Stat | Value |\n|---|---|\n");
+    out.push_str(&format!("| min    | {} |\n", human_bytes(min)));
+    out.push_str(&format!("| max    | {} |\n", human_bytes(max)));
+    out.push_str(&format!("| mean   | {} |\n", human_bytes(mean as u64)));
+    out.push_str(&format!("| median | {} |\n", human_bytes(median as u64)));
+    out.push_str(&format!("| p95    | {} |\n", human_bytes(p95)));
+    out.push_str(&format!("| p99    | {} |\n", human_bytes(p99)));
+    out.push_str(&format!("| std    | {} |\n", human_bytes(std_dev)));
+    out.push_str(&format!("| max/median ratio | {} |\n\n", ratio_to_median(max)));
+
+    out.push_str("## Histogram\n\n```\n");
+    out.push_str(&ascii_histogram(&totals, 30, 40));
+    out.push_str("```\n\n");
+
+    let outliers: Vec<(usize, u64)> = totals
+        .iter()
+        .enumerate()
+        .filter(|(_, v)| **v as f64 > outlier_threshold)
+        .map(|(i, v)| (i, *v))
+        .collect();
+
+    if outliers.is_empty() {
+        out.push_str("## Outliers\n\nNone (threshold: Q3 + 1.5×IQR = ");
+        out.push_str(&human_bytes(outlier_threshold as u64));
+        out.push_str(").\n");
+    } else {
+        out.push_str(&format!(
+            "## Outliers (> Q3 + 1.5×IQR = {})\n\n| Partition | Total size | Ratio to median |\n|---|---|---|\n",
+            human_bytes(outlier_threshold as u64)
+        ));
+        for (i, v) in &outliers {
+            out.push_str(&format!(
+                "| {} | {} | {} |\n",
+                i,
+                human_bytes(*v),
+                ratio_to_median(*v)
+            ));
+        }
+        out.push('\n');
+    }
+
+    print!("{out}");
+
+    if let Some(csv_out) = csv_path {
+        let file = std::fs::File::create(csv_out).unwrap_or_else(|e| {
+            eprintln!("error creating CSV file {}: {e}", csv_out.display());
+            std::process::exit(1);
+        });
+        let mut w = io::BufWriter::new(file);
+        // Flush explicitly: errors raised while a BufWriter is dropped are
+        // discarded, which would let a truncated file pass as success.
+        let written = (|| -> io::Result<()> {
+            writeln!(w, "partition,source,bytes")?;
+            for r in &rows {
+                writeln!(w, "{},{},{}", r.partition, r.source, r.bytes)?;
+            }
+            w.flush()
+        })();
+        if let Err(e) = written {
+            eprintln!("error writing CSV file {}: {e}", csv_out.display());
+            std::process::exit(1);
+        }
+        eprintln!("CSV written to {}", csv_out.display());
+    }
+}
+
+// ── existing single-index operations ─────────────────────────────────────────
+
+/// Prints the per-genome k-mer counts of one index as CSV on stdout.
+///
+/// Output: a `genome,n_kmers` header, one line per genome label, and a final
+/// `total` line. Failure to open the index or to compute the counts is
+/// reported on stderr and ends the process with status 1.
+fn run_stats(index_path: &PathBuf) {
+    let idx = match KmerIndex::open(index_path) {
+        Ok(opened) => opened,
+        Err(e) => {
+            eprintln!("error opening index: {e}");
+            std::process::exit(1);
+        }
+    };
+    let (total, per_genome) = match idx.genome_kmer_counts() {
+        Ok(counts) => counts,
+        Err(e) => {
+            eprintln!("error computing stats: {e}");
+            std::process::exit(1);
+        }
+    };
+
+    println!("genome,n_kmers");
+    let labels = idx.meta().genomes.iter().map(|g| &g.label);
+    for (label, n) in labels.zip(per_genome.iter()) {
+        println!("{},{}", label, n);
+    }
+    println!("total,{total}");
+}
+
 fn run_bits_per_kmer(index_path: &PathBuf) {
    let idx = KmerIndex::open(index_path).unwrap_or_else(|e| {
        eprintln!("error opening index: {e}");
@@ -59,8 +314,10 @@ fn run_bits_per_kmer(index_path: &PathBuf) {
    println!("genomes   : {}", stats.n_genomes);
    println!("mphf      : {:6.2} bits/kmer", stats.mphf);
    println!("evidence  : {:6.2} bits/kmer", stats.evidence);
-    println!("matrix    : {:6.2} bits/kmer  ({:.2} bits/kmer/genome)",
-             stats.matrix, stats.matrix_per_genome);
+    println!(
+        "matrix    : {:6.2} bits/kmer  ({:.2} bits/kmer/genome)",
+        stats.matrix, stats.matrix_per_genome
+    );
    println!("total     : {:6.2} bits/kmer", stats.total);
 }

@@ -99,7 +356,6 @@ fn run_rename(index_path: &PathBuf, spec: &str) {
        std::process::exit(1);
    });

-    // Check the new label is not already taken.
    if idx.meta().genomes.iter().any(|g| g.label == new_label) {
        eprintln!("error: label '{new_label}' already exists in index");
        std::process::exit(1);
@@ -111,7 +367,6 @@ fn run_rename(index_path: &PathBuf, spec: &str) {
        std::process::exit(1);
    });

-    // Rename the spectrum file if it exists.
    let spectrums_dir = index_path.join("spectrums");
    let old_spectrum = spectrums_dir.join(format!("{old_label}.json"));
    let new_spectrum = spectrums_dir.join(format!("{new_label}.json"));
@@ -6,7 +6,7 @@ use clap::{Parser, Subcommand};
 use tracing_subscriber::{EnvFilter, fmt};

 #[derive(Parser)]
-#[command(name = "obikmer", about = "DNA k-mer tools")]
+#[command(name = "obikmer", about = "DNA k-mer tools", version)]
 struct Cli {
    #[command(subcommand)]
    command: Commands,
@@ -20,8 +20,10 @@ enum Commands {
    Index(cmd::index::IndexArgs),
    /// Merge multiple built indexes into one
    Merge(cmd::merge::MergeArgs),
-    /// Filter and compact an existing index into a new single-layer index
-    Rebuild(cmd::rebuild::RebuildArgs),
+    /// Apply row-level selection (σ) to an index: retain only k-mers matching the predicates
+    Filter(cmd::filter::FilterArgs),
+    /// Project and/or aggregate genome columns into a new or in-place index
+    Select(cmd::select::SelectArgs),
    /// Query an index with sequences and annotate matches
    Query(cmd::query::QueryArgs),
    /// Dump all indexed kmers as CSV (kmer + per-genome counts or presence)
@@ -65,7 +67,8 @@ fn main() {
        Commands::Index(args)     => cmd::index::run(args),
        Commands::Merge(args)     => cmd::merge::run(args),
        Commands::Dump(args)      => cmd::dump::run(args),
-        Commands::Rebuild(args)   => cmd::rebuild::run(args),
+        Commands::Filter(args)    => cmd::filter::run(args),
+        Commands::Select(args)    => cmd::select::run(args),
        Commands::Query(args)     => cmd::query::run(args),
        Commands::Annotate(args)  => cmd::annotate::run(args),
        Commands::Distance(args)  => cmd::distance::run(args),
@@ -6,16 +6,17 @@ use std::time::Instant;
 use obisys::spinner;
 use obiread::NucPage;
 use obikpartitionner::KmerPartition;
+use obipipeline::{ThrottleGuard, Throttled, throttle};
 use obisys::{Reporter, Stage};
 use tracing::info;

-use crate::cli::{PipelineData, PathWithSlot, throttle_paths};
+use crate::cli::PipelineData;

 // ── Iterator that keeps the slot guard alive until the file is exhausted ──────

 struct GuardedIter {
    inner:       Box<dyn Iterator<Item = NucPage> + Send>,
-    _guard:      Box<dyn Send + 'static>,
+    _guard:      ThrottleGuard,
    flat_active: Arc<AtomicU32>,
 }

@@ -49,7 +50,7 @@ pub fn scatter(
    use obikseq::RoutableSuperKmer;

    // Throttle in the source thread — never in a worker — to prevent deadlock.
-    let throttled = throttle_paths(path_source, max_open);
+    let throttled = throttle(path_source, max_open);

    let file_count      = Arc::new(AtomicU64::new(0));
    let flat_active     = Arc::new(AtomicU32::new(0));
@@ -57,19 +58,20 @@ pub fn scatter(

    let t = Stage::start("scatter");
    let pipe = obipipeline::make_pipe! {
-        PipelineData : PathWithSlot => Vec<RoutableSuperKmer>,
+        PipelineData : Throttled<PathBuf> => Vec<RoutableSuperKmer>,
        ||? {
            let file_count   = Arc::clone(&file_count);
            let flat_active  = Arc::clone(&flat_active);
            let k = k;
-            move |pw: PathWithSlot| {
-                let PathWithSlot { path, _guard } = pw;
+            move |pw: Throttled<PathBuf>| {
+                let path  = pw.item;
+                let guard = pw.guard;
                let n = file_count.fetch_add(1, Ordering::Relaxed) + 1;
                info!("indexing [{}]: {}", n, path.display());
                let path_str = path.to_str().unwrap_or("").to_owned();
                flat_active.fetch_add(1, Ordering::Relaxed);
                obiread::open_nuc_stream(&path_str, k)
-                    .map(|iter| GuardedIter { inner: iter, _guard, flat_active: Arc::clone(&flat_active) })
+                    .map(|iter| GuardedIter { inner: iter, _guard: guard, flat_active: Arc::clone(&flat_active) })
            }
        } : Path => NucPage,
        | {
@@ -28,4 +28,5 @@ memmap2 = "0.9.10"
 obicompactvec = { path = "../obicompactvec" }
 ptr_hash = "1.1"
 indicatif = "0.17"
-obisys    = { path = "../obisys" }
+obisys      = { path = "../obisys" }
+obipipeline = { path = "../obipipeline" }
@@ -0,0 +1,84 @@
+use std::fs;
+use std::io;
+use std::path::{Path, PathBuf};
+
+use obicompactvec::{PersistentBitVecBuilder, PersistentCompactIntVecBuilder};
+use obilayeredmap::meta::PartitionMeta;
+use obilayeredmap::{IndexMode, OLMError};
+use obiskio::{SKError, SKResult};
+
+// ── olm_to_sk ────────────────────────────────────────────────────────────────
+
+/// Converts an `OLMError` into an `SKError`.
+///
+/// I/O errors are carried over as `SKError::Io`; every other error becomes
+/// `SKError::InvalidData` with the given `context` and the error's display
+/// text as detail.
+pub(crate) fn olm_to_sk(e: OLMError, context: &'static str) -> SKError {
+    match e {
+        OLMError::Io(io_err) => SKError::Io(io_err),
+        non_io => {
+            let detail = non_io.to_string();
+            SKError::InvalidData { context, detail }
+        }
+    }
+}
+
+// ── load_meta ────────────────────────────────────────────────────────────────
+
+/// Loads the `PartitionMeta` stored in `dir`, rebuilding it when it is absent
+/// (indexes built before meta.json was introduced lack the file).
+///
+/// If loading fails with an I/O `NotFound` error, the number of layers is
+/// recovered by counting consecutive `layer_0`, `layer_1`, … entries in `dir`;
+/// a `PartitionMeta` with that count and the default `IndexMode` is saved
+/// back to `dir` and returned. Any other failure (including a failed save)
+/// is converted with `olm_to_sk` using `context`.
+pub(crate) fn load_meta(dir: &Path, context: &'static str) -> SKResult<PartitionMeta> {
+    let load_err = match PartitionMeta::load(dir) {
+        Ok(meta) => return Ok(meta),
+        Err(e) => e,
+    };
+
+    let file_missing = match &load_err {
+        OLMError::Io(io_e) => io_e.kind() == std::io::ErrorKind::NotFound,
+        _ => false,
+    };
+    if !file_missing {
+        return Err(olm_to_sk(load_err, context));
+    }
+
+    // Probe layer directories until the first gap.
+    let n_layers = (0usize..)
+        .take_while(|n| dir.join(format!("layer_{n}")).exists())
+        .count();
+    let recovered = PartitionMeta {
+        n_layers,
+        mode: IndexMode::default(),
+    };
+    recovered.save(dir).map_err(|e| olm_to_sk(e, context))?;
+    Ok(recovered)
+}
+
+// ── path helpers ──────────────────────────────────────────────────────────────
+
+/// Builds the `.pbiv` file path for column `col` inside `dir`; the column
+/// number is zero-padded to six digits (`col_000042.pbiv`).
+pub(crate) fn col_path_bit(dir: &Path, col: usize) -> PathBuf {
+    let file_name = format!("col_{col:06}.pbiv");
+    dir.join(file_name)
+}
+
+/// Builds the `.pciv` file path for column `col` inside `dir`; the column
+/// number is zero-padded to six digits (`col_000042.pciv`).
+pub(crate) fn col_path_int(dir: &Path, col: usize) -> PathBuf {
+    let file_name = format!("col_{col:06}.pciv");
+    dir.join(file_name)
+}
+
+/// Writes `meta.json` in `dir`, holding the two values as
+/// `{"n":<n>,"n_cols":<n_cols>}` followed by a newline.
+pub(crate) fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> {
+    let target = dir.join("meta.json");
+    let json = format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n");
+    fs::write(target, json)
+}
+
+// ── ColBuilder ────────────────────────────────────────────────────────────────
+
+/// Builder for a single column, stored either as bits or as compact integers.
+pub(crate) enum ColBuilder {
+    /// Boolean column backed by a `PersistentBitVecBuilder`.
+    Bit(PersistentBitVecBuilder),
+    /// Integer column backed by a `PersistentCompactIntVecBuilder`.
+    Int(PersistentCompactIntVecBuilder),
+}
+
+impl ColBuilder {
+    /// Stores `value` at `slot`. A `Bit` column records only whether the
+    /// value is non-zero; an `Int` column records the value itself.
+    pub(crate) fn set_val(&mut self, slot: usize, value: u32) {
+        match self {
+            Self::Int(ints) => ints.set(slot, value),
+            Self::Bit(bits) => bits.set(slot, value != 0),
+        }
+    }
+
+    /// Consumes the builder and closes the underlying column, mapping an
+    /// I/O failure to `SKError::Io`.
+    pub(crate) fn close(self) -> SKResult<()> {
+        let closed = match self {
+            Self::Bit(bits) => bits.close(),
+            Self::Int(ints) => ints.close(),
+        };
+        closed.map_err(SKError::Io)
+    }
+}
@@ -26,17 +26,19 @@ impl KmerPartition {
    /// If no data matrix exists for a layer (pure set-membership, single genome),
    /// a row of `n_genomes` ones is emitted for every kmer in that layer — unless
    /// the filter rejects it, in which case the whole layer is skipped.
+    /// The callback returns `true` to keep iterating or `false` to stop early.
+    /// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
    pub fn iter_partition_kmers(
        &self,
        part: usize,
        use_counts: bool,
        n_genomes: usize,
        filters: &[Box<dyn KmerFilter>],
-        mut cb: impl FnMut(CanonicalKmer, Box<[u32]>),
-    ) -> SKResult<()> {
+        mut cb: impl FnMut(CanonicalKmer, Box<[u32]>) -> bool,
+    ) -> SKResult<bool> {
        let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
        if !index_dir.exists() {
-            return Ok(());
+            return Ok(true);
        }

        let index_mode = PartitionMeta::load(&index_dir)
@@ -54,56 +56,68 @@ impl KmerPartition {
            let counts_dir   = layer_dir.join("counts");
            let presence_dir = layer_dir.join("presence");

-            if use_counts && counts_dir.exists() {
+            let cont = if use_counts && counts_dir.exists() {
                let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
+                let mut cont = true;
                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                    if let Some(slot) = mphf.find(kmer) {
                        let row = mat.row(slot);
                        if passes_all(filters, &row, n_genomes) {
-                            cb(kmer, row);
+                            cont = cb(kmer, row);
+                            if !cont { break; }
                        }
                    }
                }
+                cont
            } else if !use_counts && presence_dir.exists() {
                let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
+                let mut cont = true;
                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                    if let Some(slot) = mphf.find(kmer) {
                        let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
                        if passes_all(filters, &row, n_genomes) {
-                            cb(kmer, row);
+                            cont = cb(kmer, row);
+                            if !cont { break; }
                        }
                    }
                }
+                cont
            } else {
                // No data matrix: implicit presence — all values are 1.
                // The filter result is identical for every kmer, so evaluate once.
                let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
+                let mut cont = true;
                if passes_all(filters, &all_present, n_genomes) {
                    for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                        if mphf.find(kmer).is_some() {
-                            cb(kmer, all_present.clone());
+                            cont = cb(kmer, all_present.clone());
+                            if !cont { break; }
                        }
                    }
                }
-            }
+                cont
+            };
+
+            if !cont { return Ok(false); }
        }

-        Ok(())
+        Ok(true)
    }

    /// Like [`iter_partition_kmers`] but the callback also receives `(partition, layer)`
    /// indices, enabling debug output that identifies where each kmer was stored.
+    /// Returns `Ok(true)` if all kmers were visited, `Ok(false)` if the callback halted.
    pub fn iter_partition_kmers_located(
        &self,
        part: usize,
        use_counts: bool,
        n_genomes: usize,
        filters: &[Box<dyn KmerFilter>],
-        mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>),
-    ) -> SKResult<()> {
+        mut cb: impl FnMut(usize, usize, CanonicalKmer, Box<[u32]>) -> bool,
+    ) -> SKResult<bool> {
        let index_dir = self.part_dir(part).join(INDEX_SUBDIR);
        if !index_dir.exists() {
-            return Ok(());
+            return Ok(true);
        }

        let index_mode = PartitionMeta::load(&index_dir)
@@ -120,39 +134,50 @@ impl KmerPartition {
            let counts_dir   = layer_dir.join("counts");
            let presence_dir = layer_dir.join("presence");

-            if use_counts && counts_dir.exists() {
+            let cont = if use_counts && counts_dir.exists() {
                let mat = PersistentCompactIntMatrix::open(&layer_dir).map_err(SKError::Io)?;
+                let mut cont = true;
                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                    if let Some(slot) = mphf.find(kmer) {
                        let row = mat.row(slot);
                        if passes_all(filters, &row, n_genomes) {
-                            cb(part, layer, kmer, row);
+                            cont = cb(part, layer, kmer, row);
+                            if !cont { break; }
                        }
                    }
                }
+                cont
            } else if !use_counts && presence_dir.exists() {
                let mat = PersistentBitMatrix::open(&layer_dir).map_err(SKError::Io)?;
+                let mut cont = true;
                for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                    if let Some(slot) = mphf.find(kmer) {
                        let row: Box<[u32]> = mat.row(slot).iter().map(|&b| b as u32).collect();
                        if passes_all(filters, &row, n_genomes) {
-                            cb(part, layer, kmer, row);
+                            cont = cb(part, layer, kmer, row);
+                            if !cont { break; }
                        }
                    }
                }
+                cont
            } else {
                let all_present: Box<[u32]> = vec![1u32; n_genomes].into();
+                let mut cont = true;
                if passes_all(filters, &all_present, n_genomes) {
                    for (kmer, _, _) in reader.iter_indexed_canonical_kmers() {
                        if mphf.find(kmer).is_some() {
-                            cb(part, layer, kmer, all_present.clone());
+                            cont = cb(part, layer, kmer, all_present.clone());
+                            if !cont { break; }
                        }
                    }
                }
-            }
+                cont
+            };
+
+            if !cont { return Ok(false); }
            layer += 1;
        }

-        Ok(())
+        Ok(true)
    }
 }
--- a/Show More
+++ b/Show More