diff --git a/.DS_Store b/.DS_Store index 2b2a356..96c32f7 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml new file mode 100644 index 0000000..7c2b524 --- /dev/null +++ b/.gitea/workflows/ci.yml @@ -0,0 +1,36 @@ +name: CI + +on: + push: + branches: ['**'] + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + defaults: + run: + working-directory: src + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + run: | + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + + - name: Cache cargo registry + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + src/target + key: ${{ runner.os }}-cargo-${{ hashFiles('src/Cargo.lock') }} + restore-keys: ${{ runner.os }}-cargo- + + - name: Build + run: cargo build --release + + - name: Test + run: cargo test --release diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml new file mode 100644 index 0000000..b60d4b2 --- /dev/null +++ b/.gitea/workflows/release.yml @@ -0,0 +1,48 @@ +name: Release + +on: + push: + tags: + - 'v*' + +jobs: + build-linux-static: + runs-on: ubuntu-latest + defaults: + run: + working-directory: src + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + musl target + run: | + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable + echo "$HOME/.cargo/bin" >> $GITHUB_PATH + sudo apt-get update -qq && sudo apt-get install -y -qq musl-tools + $HOME/.cargo/bin/rustup target add x86_64-unknown-linux-musl + + - name: Cache cargo registry + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + src/target + key: linux-musl-cargo-${{ hashFiles('src/Cargo.lock') }} + restore-keys: linux-musl-cargo- + + - name: Build static binary + run: cargo build --release --target x86_64-unknown-linux-musl + + - name: Prepare artifact + 
run: | + mkdir -p /tmp/dist + cp target/x86_64-unknown-linux-musl/release/obikmer /tmp/dist/obikmer-linux-x86_64 + strip /tmp/dist/obikmer-linux-x86_64 + + - name: Upload release asset + uses: actions/upload-artifact@v4 + with: + name: obikmer-linux-x86_64 + path: /tmp/dist/obikmer-linux-x86_64 + if-no-files-found: error diff --git a/.gitignore b/.gitignore index 76d17de..ec94743 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,13 @@ data-stress ./**/*.json *.bin Betula_exilis--IGA-24-33 +benchmark/genomes +benchmark/simulated_data +benchmark/specimen_index_presence +benchmark/specimen_index_count +benchmark/global_index_presence +benchmark/global_index_count +benchmark/stats +benchmark/reference_index +benchmark/specific_index_count +benchmark/specific_index_presence diff --git a/.serena/.gitignore b/.serena/.gitignore new file mode 100644 index 0000000..2e510af --- /dev/null +++ b/.serena/.gitignore @@ -0,0 +1,2 @@ +/cache +/project.local.yml diff --git a/.serena/project.yml b/.serena/project.yml new file mode 100644 index 0000000..1a35e2f --- /dev/null +++ b/.serena/project.yml @@ -0,0 +1,133 @@ +# the name by which the project can be referenced within Serena +project_name: "obikmer" + + +# list of languages for which language servers are started; choose from: +# al angular ansible bash clojure +# cpp cpp_ccls crystal csharp csharp_omnisharp +# dart elixir elm erlang fortran +# fsharp go groovy haskell haxe +# hlsl html java json julia +# kotlin lean4 lua luau markdown +# matlab msl nix ocaml pascal +# perl php php_phpactor powershell python +# python_jedi python_ty r rego ruby +# ruby_solargraph rust scala scss solidity +# svelte swift systemverilog terraform toml +# typescript typescript_vts vue yaml zig +# (This list may be outdated. For the current list, see values of Language enum here: +# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py +# For some languages, there are alternative language servers, e.g. 
csharp_omnisharp, ruby_solargraph.) +# Note: +# - For C, use cpp +# - For JavaScript, use typescript +# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root) +# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm) +# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three) +# - For Free Pascal/Lazarus, use pascal +# Special requirements: +# Some languages require additional setup/installations. +# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers +# When using multiple languages, the first language server that supports a given file will be used for that file. +# The first language is the default language and the respective language server will be used as a fallback. +# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored. +languages: +- rust + +# the encoding used by text files in the project +# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings +encoding: "utf-8" + +# line ending convention to use when writing source files. +# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default) +# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings. +line_ending: + +# The language backend to use for this project. +# If not set, the global setting from serena_config.yml is used. +# Valid values: LSP, JetBrains +# Note: the backend is fixed at startup. If a project with a different backend +# is activated post-init, an error will be returned. +language_backend: + +# whether to use project's .gitignore files to ignore files +ignore_all_files_in_gitignore: true + +# advanced configuration option allowing to configure language server-specific options. 
+# Maps the language key to the options. +# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available. +# No documentation on options means no options are available. +ls_specific_settings: {} + +# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos). +# Paths can be absolute or relative to the project root. +# Each folder is registered as an LSP workspace folder, enabling language servers to discover +# symbols and references across package boundaries. +# Currently supported for: TypeScript. +# Example: +# additional_workspace_folders: +# - ../sibling-package +# - ../shared-lib +additional_workspace_folders: [] + +# list of additional paths to ignore in this project. +# Same syntax as gitignore, so you can use * and **. +# Note: global ignored_paths from serena_config.yml are also applied additively. +ignored_paths: [] + +# whether the project is in read-only mode +# If set to true, all editing tools will be disabled and attempts to use them will result in an error +# Added on 2025-04-18 +read_only: false + +# list of tool names to exclude. +# This extends the existing exclusions (e.g. from the global configuration) +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +excluded_tools: [] + +# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default). +# This extends the existing inclusions (e.g. from the global configuration). +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +included_optional_tools: [] + +# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools. +# This cannot be combined with non-empty excluded_tools or included_optional_tools. 
+# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +fixed_tools: [] + +# list of mode names that are to be activated by default, overriding the setting in the global configuration. +# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes. +# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply. +# Otherwise, this overrides the setting from the global configuration (serena_config.yml). +# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply +# for this project. +# This setting can, in turn, be overridden by CLI parameters (--mode). +# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes +default_modes: + +# list of mode names to be activated additionally for this project, e.g. ["query-projects"] +# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes. +# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes +added_modes: + +# initial prompt for the project. It will always be given to the LLM upon activating the project +# (contrary to the memories, which are loaded on demand). +initial_prompt: "" + +# time budget (seconds) per tool call for the retrieval of additional symbol information +# such as docstrings or parameter information. +# This overrides the corresponding setting in the global configuration; see the documentation there. +# If null or missing, use the setting from the global configuration. +symbol_info_budget: + +# list of regex patterns which, when matched, mark a memory entry as read‑only. +# Extends the list from the global configuration, merging the two lists. +read_only_memory_patterns: [] + +# list of regex patterns for memories to completely ignore. 
+# Matching memories will not appear in list_memories or activate_project output +# and cannot be accessed via read_memory or write_memory. +# To access ignored memory files, use the read_file tool on the raw file path. +# Extends the list from the global configuration, merging the two lists. +# Example: ["_archive/.*", "_episodes/.*"] +ignored_memory_patterns: [] diff --git a/CLAUDE.md b/CLAUDE.md index 6fa8412..c6cac5a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s --- Je continue à poser mes questions et à guider la discussion. + +--- + +## MCP Tools + +**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.** + +### Hiérarchie des outils pour ce projet Rust + +**Navigation et édition de code → serena en priorité** +- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations` +- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols` +- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file` +- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview` +- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body` +- Ne pas utiliser `cclsp` quand serena couvre le besoin + +**Analyse architecturale → jcodemunch** +- Hotspots, couplage, dead code, dépendances entre modules +- Utiliser avant de refactorer une zone critique + +**Raisonnement complexe → sequential-thinking** +- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux + +**Documentation de crates → context7** +- Toujours consulter avant d'utiliser une API de bibliothèque externe diff --git a/Makefile b/Makefile index e203e6a..04942bf 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate mkdocs mkdocs-material \ 
mkdocs-mermaid2-plugin \ mkdocs-bibtex + $(PIP) install --quiet --upgrade InSilicoSeq # ── obikmer binary ─────────────────────────────────────────────────────────── @@ -62,3 +63,28 @@ clean-doc: .PHONY: clean clean: clean-doc rm -rf $(VENV) + +# ── release ─────────────────────────────────────────────────────────────────── + +CARGO_TOML := $(CARGO_DIR)/obikmer/Cargo.toml + +.PHONY: bump-version +bump-version: + @current=$$(grep '^version = ' $(CARGO_TOML) | head -n 1 | sed 's/version = "\(.*\)"/\1/'); \ + if [ -n "$(RELEASE)" ]; then \ + new_version="$(RELEASE)"; \ + else \ + major=$$(echo $$current | cut -d. -f1); \ + minor=$$(echo $$current | cut -d. -f2); \ + patch=$$(echo $$current | cut -d. -f3); \ + new_patch=$$((patch + 1)); \ + new_version="$$major.$$minor.$$new_patch"; \ + fi; \ + echo "Version: $$current -> $$new_version"; \ + sed -i.bak "s/^version = \"$$current\"/version = \"$$new_version\"/" $(CARGO_TOML) && \ + rm $(CARGO_TOML).bak + +.PHONY: release +release: bump-version + @jj auto-describe + @jj git push --change @ diff --git a/benchmark/Makefile b/benchmark/Makefile new file mode 100644 index 0000000..5654ecc --- /dev/null +++ b/benchmark/Makefile @@ -0,0 +1,144 @@ +# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS +BINARY := ../src/target/release/obikmer +VENV_PY := ../.venv/bin/python3 + +GENOMES := $(wildcard genomes/*.fna.gz) + +# SPECIMENS, SPECIES, and the full dependency graph are generated by +# make_deps.py from the genome FASTA headers — like .d files in C. +# Make rebuilds deps.mk whenever genomes/ changes and restarts. 
+-include deps.mk + +REF_NPZS := $(SPECIMENS:%=reference_index/%.npz) +PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done) +PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats) +COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done) +COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats) +VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats) +VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats) +SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done) +SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats) +SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done) +SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats) +SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz) + +.NOTPARALLEL: + +.PHONY: all simulate reference \ + index_presence index_count \ + aggregate_index_presence aggregate_index_count \ + merge_presence merge_count \ + verify_presence verify_count \ + aggregate_verify_presence aggregate_verify_count \ + verify_merge_presence verify_merge_count \ + filter_presence filter_count \ + aggregate_filter_presence aggregate_filter_count + +verify_merge_presence: stats/verify_merge_presence/current.csv +verify_merge_count: stats/verify_merge_count/current.csv + +all: aggregate_verify_presence aggregate_verify_count \ + verify_merge_presence verify_merge_count \ + aggregate_filter_presence aggregate_filter_count + +# ── dependency file ─────────────────────────────────────────────────────────── + +deps.mk: $(GENOMES) + $(VENV_PY) make_deps.py $^ > $@ + +# ── simulation ──────────────────────────────────────────────────────────────── +# Prerequisites (genome → reads) are in deps.mk; $< is the genome file. 
+ +$(SIMULATED_READS): + bash simulate_one.sh $< $(dir $@) + +simulate: $(SIMULATED_READS) + +# ── reference kmer sets ─────────────────────────────────────────────────────── +# Prerequisites (reads → npz) are in deps.mk. + +reference_index/%.npz: + bash build_reference.sh $* + +reference: $(REF_NPZS) + +# ── per-specimen indexing ───────────────────────────────────────────────────── +# Prerequisites (reads → index.done + .stats) are in deps.mk. + +specimen_index_presence/%/index.done \ +stats/indexing_presence/%.stats &: $(BINARY) + bash index_one_presence.sh $* + +specimen_index_count/%/index.done \ +stats/indexing_count/%.stats &: $(BINARY) + bash index_one_count.sh $* + +index_presence: $(PRESENCE_DONE) +index_count: $(COUNT_DONE) + +# ── indexing stats aggregation ──────────────────────────────────────────────── + +aggregate_index_presence: $(PRESENCE_STATS) + bash aggregate_stats.sh indexing_presence + +aggregate_index_count: $(COUNT_STATS) + bash aggregate_stats.sh indexing_count + +# ── global merge ────────────────────────────────────────────────────────────── + +global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY) + bash merge_presence.sh + +global_index_count/index.done: $(COUNT_DONE) $(BINARY) + bash merge_count.sh + +merge_presence: global_index_presence/index.done +merge_count: global_index_count/index.done + +# ── per-specimen verification ───────────────────────────────────────────────── +# Prerequisites (index.done + npz → .stats) are in deps.mk. 
+ +stats/verify_presence/%.stats: + bash verify_one_presence.sh $* + +stats/verify_count/%.stats: + bash verify_one_count.sh $* + +verify_presence: $(VERIFY_PRESENCE_STATS) +verify_count: $(VERIFY_COUNT_STATS) + +# ── verification stats aggregation ─────────────────────────────────────────── + +aggregate_verify_presence: $(VERIFY_PRESENCE_STATS) + bash aggregate_stats.sh verify_presence + +aggregate_verify_count: $(VERIFY_COUNT_STATS) + bash aggregate_stats.sh verify_count + +# ── species-specific indexes ────────────────────────────────────────────────── +# Prerequisites (global index → specific index) are in deps.mk. + +specific_index_presence/%/index.done \ +stats/specific_kmer_presence/%.stats &: $(BINARY) + bash filter_one_presence.sh $* + +specific_index_count/%/index.done \ +stats/specific_kmer_count/%.stats &: $(BINARY) + bash filter_one_count.sh $* + +filter_presence: $(SPECIFIC_PRESENCE_DONE) +filter_count: $(SPECIFIC_COUNT_DONE) + +aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS) + bash aggregate_stats.sh specific_kmer_presence + +aggregate_filter_count: $(SPECIFIC_COUNT_STATS) + bash aggregate_stats.sh specific_kmer_count + +# ── merged index verification ───────────────────────────────────────────────── + +stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done + bash verify_merge_presence.sh + +stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done + bash verify_merge_count.sh diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..04ad741 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,132 @@ +# Benchmark pipeline + +Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`. 
+ +``` +gmake all # full pipeline +gmake simulate # simulation only +gmake reference # reference kmer sets only +``` + +## Pipeline overview + +```mermaid +flowchart TD + GENOMES["genomes/*.fna.gz"] + BIN["obikmer binary"] + + GENOMES --> simulate + simulate --> simdata[("simulated_data/")] + + simdata --> reference + reference --> refnpz[("reference_index/*.npz")] + + subgraph presence ["Presence track"] + simdata --> index_presence + BIN --> index_presence + index_presence --> pres_done[("specimen_index_presence/")] + index_presence --> pres_istats[("stats/indexing_presence/")] + pres_istats --> aggregate_index_presence + + pres_done --> merge_presence + BIN --> merge_presence + merge_presence --> gpres[("global_index_presence/")] + + refnpz --> verify_presence + pres_done --> verify_presence + verify_presence --> vpres_stats[("stats/verify_presence/")] + vpres_stats --> aggregate_verify_presence + + gpres --> filter_presence + BIN --> filter_presence + filter_presence --> spec_pres[("specific_index_presence/")] + filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")] + spec_pres_stats --> aggregate_filter_presence + + refnpz --> verify_merge_presence + gpres --> verify_merge_presence + verify_merge_presence --> vmp[("stats/verify_merge_presence/")] + end + + subgraph count ["Count track"] + simdata --> index_count + BIN --> index_count + index_count --> count_done[("specimen_index_count/")] + index_count --> count_istats[("stats/indexing_count/")] + count_istats --> aggregate_index_count + + count_done --> merge_count + BIN --> merge_count + merge_count --> gcount[("global_index_count/")] + + refnpz --> verify_count + count_done --> verify_count + verify_count --> vcount_stats[("stats/verify_count/")] + vcount_stats --> aggregate_verify_count + + gcount --> filter_count + BIN --> filter_count + filter_count --> spec_count[("specific_index_count/")] + filter_count --> spec_count_stats[("stats/specific_kmer_count/")] + spec_count_stats --> 
aggregate_filter_count + + refnpz --> verify_merge_count + gcount --> verify_merge_count + verify_merge_count --> vmc[("stats/verify_merge_count/")] + end + + aggregate_verify_presence --> all + aggregate_verify_count --> all + vmp --> all + vmc --> all + all -. "$(MAKE) re-eval" .-> aggregate_filter_presence + all -. "$(MAKE) re-eval" .-> aggregate_filter_count +``` + +## Steps + +| Target | Script | Description | +|---|---|---| +| `simulate` | `simulate_one.sh` | Simulate sequencing reads from the reference genomes | +| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from the simulated reads | +| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) | +| `index_count` | `index_one_count.sh` | Index each specimen (count mode) | +| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) | +| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) | +| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index | +| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index | +| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference | +| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference | +| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) | +| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) | +| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index | +| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index | +| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) | +| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate
species-specific kmer stats (count) | +| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets | +| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets | + +## Directory layout + +``` +benchmark/ +├── genomes/ # input reference genomes (.fna.gz) +├── simulated_data/ # generated by simulate +│ └── <species>/<strain>/ +├── reference_index/ # reference kmer sets (.npz) +├── specimen_index_presence/ # per-specimen presence indexes +├── specimen_index_count/ # per-specimen count indexes +├── global_index_presence/ # merged global presence index +├── global_index_count/ # merged global count index +├── specific_index_presence/ # species-specific presence indexes +├── specific_index_count/ # species-specific count indexes +└── stats/ # all benchmark statistics + ├── indexing_presence/ + ├── indexing_count/ + ├── verify_presence/ + ├── verify_count/ + ├── specific_kmer_presence/ + ├── specific_kmer_count/ + ├── verify_merge_presence/ + └── verify_merge_count/ +``` diff --git a/benchmark/aggregate_stats.sh b/benchmark/aggregate_stats.sh new file mode 100755 index 0000000..19901bb --- /dev/null +++ b/benchmark/aggregate_stats.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Usage: aggregate_stats.sh TYPE +# TYPE = indexing_presence | indexing_count | verify_presence | verify_count | specific_kmer_presence | specific_kmer_count +# +# Reads all stats/TYPE/*.stats files (one CSV data row each, no header). +# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than +# the most recent run CSV (idempotent when nothing changed).
+set -euo pipefail + +TYPE="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}" + +case "${TYPE}" in + indexing_presence|indexing_count) + HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b" + ;; + verify_presence) + HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct" + ;; + verify_count) + HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct" + ;; + specific_kmer_presence|specific_kmer_count) + HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b" + ;; + *) + echo "ERROR: unknown stats type '${TYPE}'" >&2 + exit 1 + ;; +esac + +# Find most recent existing run CSV (empty string if none). +latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1) + +# Check if any .stats file is newer than the latest run CSV. +if [[ -n "${latest_csv}" ]] && \ + [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then + echo "[${TYPE}] stats up to date (${latest_csv})" + exit 0 +fi + +run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')") +CSV="${STATS_DIR}/run_${run_n}.csv" + +echo "${HEADER}" >"${CSV}" + +# Sort .stats files by name for reproducible row order. 
+while IFS= read -r stats_file; do + sed "s/^/${run_n},/" "${stats_file}" +done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}" + +echo "[${TYPE}] run ${run_n} → ${CSV}" diff --git a/benchmark/build_reference.py b/benchmark/build_reference.py new file mode 100755 index 0000000..eddd3da --- /dev/null +++ b/benchmark/build_reference.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +"""Build a reference kmer index from paired-end FASTQ reads. + +Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 — +counts their abundances, and saves a sorted numpy pair (kmers, counts). + +Output .npz arrays + kmers : uint64, sorted ascending — canonical kmer integers + counts : uint32, same order — raw read abundances +""" +import argparse +import gzip +import sys +from collections import defaultdict + +import numpy as np + + +# ── encoding ──────────────────────────────────────────────────────────────── + +_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, + 'a': 0, 'c': 1, 'g': 2, 't': 3} + +# Lookup table: revcomp of one byte (4 bases, 8 bits). +# Precomputed once at import time. +_REVCOMP8 = [0] * 256 +for _i in range(256): + _rc, _x = 0, _i + for _ in range(4): + _rc = (_rc << 2) | (3 - (_x & 3)) + _x >>= 2 + _REVCOMP8[_i] = _rc +del _i, _rc, _x + + +def revcomp_int(kmer: int, k: int) -> int: + """Reverse-complement of a kmer encoded as an integer (2 bits/base). + + Uses byte-level lookup (4 bases at a time) for speed. 
+ """ + rc = 0 + bits_left = 2 * k + while bits_left > 0: + chunk = min(8, bits_left) + rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk) + rc = (rc << chunk) | rc_byte + kmer >>= chunk + bits_left -= chunk + return rc + + +# ── FASTQ parsing ──────────────────────────────────────────────────────────── + +def iter_sequences(path: str): + """Yield raw sequences from a (gzipped) FASTQ file.""" + opener = gzip.open if path.endswith('.gz') else open + with opener(path, 'rt') as fh: + while True: + if not fh.readline(): # '@' header + break + seq = fh.readline().rstrip('\n') + fh.readline() # '+' + fh.readline() # quality + yield seq + + +# ── kmer counting ──────────────────────────────────────────────────────────── + +def count_kmers(paths: list[str], k: int) -> dict[int, int]: + mask = (1 << (2 * k)) - 1 + counts: dict[int, int] = defaultdict(int) + n_reads = 0 + + for path in paths: + for seq in iter_sequences(path): + n_reads += 1 + kmer = 0 + run = 0 # consecutive valid bases + + for c in seq: + b = _ENCODE.get(c) + if b is None: # N or unexpected character → reset + kmer = 0 + run = 0 + continue + kmer = ((kmer << 2) | b) & mask + run += 1 + if run >= k: + rc = revcomp_int(kmer, k) + counts[kmer if kmer <= rc else rc] += 1 + + if n_reads % 100_000 == 0: + print(f' {n_reads:,} reads processed, ' + f'{len(counts):,} distinct kmers so far', + file=sys.stderr) + + print(f' {n_reads:,} reads total, {len(counts):,} distinct kmers', + file=sys.stderr) + return counts + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('reads', nargs='+', metavar='FASTQ', + help='Input reads (FASTQ, gzip OK)') + ap.add_argument('-k', '--kmer-size', type=int, default=31, + metavar='K') + ap.add_argument('--min-abundance', type=int, default=1, + metavar='N', help='Drop kmers with count < N (default 1)') + 
ap.add_argument('-o', '--output', required=True, + metavar='FILE', help='Output .npz path') + args = ap.parse_args() + + print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr) + counts = count_kmers(args.reads, args.kmer_size) + + if args.min_abundance > 1: + before = len(counts) + counts = {k: v for k, v in counts.items() if v >= args.min_abundance} + print(f' min-abundance={args.min_abundance}: ' + f'{before - len(counts):,} kmers dropped, ' + f'{len(counts):,} retained', + file=sys.stderr) + + print(f'Sorting and saving → {args.output}', file=sys.stderr) + kmers_arr = np.fromiter(sorted(counts), dtype=np.uint64, count=len(counts)) + counts_arr = np.array([counts[int(k)] for k in kmers_arr], dtype=np.uint32) + + np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr) + print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/benchmark/build_reference.sh b/benchmark/build_reference.sh new file mode 100755 index 0000000..3d312c1 --- /dev/null +++ b/benchmark/build_reference.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SIMDATA_DIR="${SCRIPT_DIR}/simulated_data" +REF_DIR="${SCRIPT_DIR}/reference_index" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +BUILD_PY="${SCRIPT_DIR}/build_reference.py" + +KMER_SIZE="${KMER_SIZE:-31}" +MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}" + +mkdir -p "${REF_DIR}" + +for species_dir in "${SIMDATA_DIR}"/*/; do + [[ -d "${species_dir}" ]] || continue + species=$(basename "${species_dir}") + + for strain_dir in "${species_dir}"*/; do + [[ -d "${strain_dir}" ]] || continue + strain=$(basename "${strain_dir}") + + r1="${strain_dir}/reads_R1.fastq.gz" + r2="${strain_dir}/reads_R2.fastq.gz" + if [[ ! -f "${r1}" || ! 
-f "${r2}" ]]; then + echo "SKIP ${species}--${strain}: reads not found" >&2 + continue + fi + + out="${REF_DIR}/${species}--${strain}.npz" + echo "[${species}--${strain}] → ${out}" + + "${PYTHON}" "${BUILD_PY}" \ + --kmer-size "${KMER_SIZE}" \ + --min-abundance "${MIN_ABUNDANCE}" \ + --output "${out}" \ + "${r1}" "${r2}" + done +done diff --git a/benchmark/deps.mk b/benchmark/deps.mk new file mode 100644 index 0000000..031dd59 --- /dev/null +++ b/benchmark/deps.mk @@ -0,0 +1,199 @@ +SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2 +SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris + +# Escherichia_coli--K-12_MG1655 +simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz +reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz 
+stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done +stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done + +# Escherichia_coli--EDL933 +simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz +reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz +stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done +stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done + +# Salmonella_enterica--LT2 +simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz +reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done 
+stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done + +# Escherichia_coli--CFT073 +simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz +reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz +stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done +stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done + +# Bacillus_subtilis--168 +simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz +reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz +specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz +specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz +stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done +stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done + +# Salmonella_enterica--P125109 
+simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz +reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done +stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done + +# Shouchella_clausii--KSM-K16 +simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz +reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz +specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz +specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz +stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done +stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done + +# Escherichia_coli--K-12_W3110 +simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: 
genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz +reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz +stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done +stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done + +# Klebsiella_pneumoniae--MGH_78578 +simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz +reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz +specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz +specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz +stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done +stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done + +# Opitutus_terrae--PB90-1 +simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: 
genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz +reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz +specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz +specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz +stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done +stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done + +# Saccharolobus_islandicus--M.16.4 +simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz +reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz +specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz +specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz +stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done +stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done + +# Acidobacterium_capsulatum--ATCC_51196 +simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz 
+reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz +specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz +specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz +stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done +stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done + +# Salmonella_enterica--AKU_12601 +simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz +reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done +stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done + +# Proteus_mirabilis--HI4320 
+simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz +reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz +specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz +specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz +stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done +stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done + +# Salmonella_enterica--CT18 +simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz +reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done +stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done + +# Klebsiella_pneumoniae--HS11286 +simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz 
+reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz +specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz +specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz +stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done +stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done + +# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 +simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz +reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz +specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz +specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz +stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done 
+stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done + +# Klebsiella_pneumoniae--ATCC_13883 +simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz +reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz +specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz +specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz +stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done +stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done + +# Yersinia_ruckeri--YRB +simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz +reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz +specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz +specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz +stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done 
+stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done + +# Candidozyma_auris--GCF_003013715.1_ASM301371v2 +simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz +reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz +specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz +specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz +stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done +stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done + +# Escherichia_coli +specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done +specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done +# Salmonella_enterica +specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done +specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done +# 
Bacillus_subtilis +specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done +specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done +# Shouchella_clausii +specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done +specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done +# Klebsiella_pneumoniae +specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done +specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done +# Opitutus_terrae +specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done +specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done +# Saccharolobus_islandicus +specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done +specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done +# Acidobacterium_capsulatum +specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done +specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done +# Proteus_mirabilis +specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: 
global_index_presence/index.done +specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done +# Wolbachia_endosymbiont +specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done +specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done +# Yersinia_ruckeri +specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done +specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done +# Candidozyma_auris +specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done +specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
diff --git a/benchmark/downloads.sh b/benchmark/downloads.sh
new file mode 100755
index 0000000..d86111e
--- /dev/null
+++ b/benchmark/downloads.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Download each listed genome assembly with the `datasets` CLI and store its
+# FASTA files under genomes/ as <name>.fna.gz (written through
+# `obiconvert -Z`). Needs `datasets`, `unzip` and `obiconvert` on PATH.
+
+set -euo pipefail
+
+assemblies=(
+  GCF_000005845.2
+  GCF_000010245.2
+  GCF_000007445.1
+  GCF_000006665.1
+
+  GCF_000006945.2
+  GCF_000195995.1
+  GCF_000009505.1
+  GCF_000026565.1
+
+  GCF_000016305.1
+  GCF_000019965.1
+  GCF_000240185.1
+  GCF_000742135.1
+
+  GCF_000069965.1
+  GCF_000022565.1
+  GCF_000306885.1
+  GCF_003013715.1
+
+  GCF_000009045.1
+  GCF_000009825.1
+  GCF_000022445.1
+  GCF_000834255.1
+)
+
+mkdir -p genomes
+
+for acc in "${assemblies[@]}"; do
+  echo "Downloading ${acc}"
+
+  datasets download genome accession "${acc}" \
+    --include genome \
+    --filename "${acc}.zip"
+
+  unzip -q "${acc}.zip" -d "${acc}"
+
+  # NUL-delimited names plus `IFS= read -r` keep paths containing spaces,
+  # backslashes or glob characters intact; every expansion is quoted.
+  find "${acc}" -name "*.fna" -print0 |
+    while IFS= read -r -d '' file; do
+      obiconvert -Z "${file}" >"genomes/$(basename "${file}").gz"
+    done
+
+  rm -rf "${acc}" "${acc}.zip"
+done
diff --git a/benchmark/filter_one_count.sh b/benchmark/filter_one_count.sh new file mode 100755 index 0000000..115ed3c --- /dev/null +++ b/benchmark/filter_one_count.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# Usage: filter_one_count.sh SPECIES +# Filters global_index_count to keep only kmers specific to SPECIES, +# then selects the SPECIES column in-place. +# Outputs: +# specific_index_count/SPECIES/index.done (written by obikmer select) +# stats/specific_kmer_count/SPECIES.stats (one CSV data row, no header) +set -euo pipefail + +SPECIES="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" + +SOURCE="${SCRIPT_DIR}/global_index_count" +OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}" +STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count" +STATS_FILE="${STATS_DIR}/${SPECIES}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIES}] filter (count) → ${OUTPUT}" + +LOG_FILTER=$(mktemp) +LOG_SELECT=$(mktemp) +trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT + +"${BINARY}" filter \ + --output "${OUTPUT}" \ + --force \ + --ingroup "species=${SPECIES}" \ + --outgroup all \ + --min-frac 0.5 \ + --max-frac 1.0 \ + --max-outgroup-count 0 \ + "${SOURCE}" \ + 2>"${LOG_FILTER}" + +cat "${LOG_FILTER}" >&2 + +"${BINARY}" select \ + --in-place \ + --group "${SPECIES}:species=${SPECIES}" \ + --group-op "${SPECIES}:any" \ + --select "${SPECIES}" \ + "${OUTPUT}" \ + 2>"${LOG_SELECT}" + +cat "${LOG_SELECT}" >&2 + +python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}" +import sys, re + +species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)',
s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +def parse_reporter(logfile): + stats = {} + state = 'scan' + with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): state = 'rows' + elif state == 'rows': + if is_sep(s): state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats['TOTAL'] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + return stats + +f = parse_reporter(log_filter) +s = parse_reporter(log_select) + +row = [species] +for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]: + key = 'TOTAL' if stage.endswith('_total') else stage + w, r = d.get(key, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +print(','.join(row)) +PYEOF diff --git a/benchmark/filter_one_presence.sh b/benchmark/filter_one_presence.sh new file mode 100755 index 0000000..12099ce --- /dev/null +++ b/benchmark/filter_one_presence.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# Usage: filter_one_presence.sh SPECIES +# Filters global_index_presence to keep only kmers specific to SPECIES, +# then selects the SPECIES column in-place. 
+# Outputs: +# specific_index_presence/SPECIES/index.done (written by obikmer select) +# stats/specific_kmer_presence/SPECIES.stats (one CSV data row, no header) +set -euo pipefail + +SPECIES="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" + +SOURCE="${SCRIPT_DIR}/global_index_presence" +OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}" +STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence" +STATS_FILE="${STATS_DIR}/${SPECIES}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIES}] filter (presence) → ${OUTPUT}" + +LOG_FILTER=$(mktemp) +LOG_SELECT=$(mktemp) +trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT + +"${BINARY}" filter \ + --output "${OUTPUT}" \ + --force \ + --ingroup "species=${SPECIES}" \ + --outgroup all \ + --min-frac 0.5 \ + --max-frac 1.0 \ + --max-outgroup-count 0 \ + "${SOURCE}" \ + 2>"${LOG_FILTER}" + +cat "${LOG_FILTER}" >&2 + +"${BINARY}" select \ + --in-place \ + --group "${SPECIES}:species=${SPECIES}" \ + --group-op "${SPECIES}:any" \ + --select "${SPECIES}" \ + "${OUTPUT}" \ + 2>"${LOG_SELECT}" + +cat "${LOG_SELECT}" >&2 + +python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}" +import sys, re + +species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +def parse_reporter(logfile): + stats = {} + state = 'scan' + with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() 
+ if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): state = 'rows' + elif state == 'rows': + if is_sep(s): state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats['TOTAL'] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + return stats + +f = parse_reporter(log_filter) +s = parse_reporter(log_select) + +row = [species] +for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]: + key = 'TOTAL' if stage.endswith('_total') else stage + w, r = d.get(key, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +print(','.join(row)) +PYEOF diff --git a/benchmark/index_one_count.sh b/benchmark/index_one_count.sh new file mode 100755 index 0000000..325ec7f --- /dev/null +++ b/benchmark/index_one_count.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# Usage: index_one_count.sh SPECIMEN +# SPECIMEN = "species--strain" (Make pattern stem) +# Outputs: +# specimen_index_count/SPECIMEN/index.done (written by obikmer) +# stats/indexing_count/SPECIMEN.stats (one CSV data row, no header) +set -euo pipefail + +SPECIMEN="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" + +species="${SPECIMEN%%--*}" +strain="${SPECIMEN#*--}" + +READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}" +INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}" +STATS_DIR="${SCRIPT_DIR}/stats/indexing_count" +STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats" + +mkdir -p "${STATS_DIR}" + +r1="${READS_DIR}/reads_R1.fastq.gz" +r2="${READS_DIR}/reads_R2.fastq.gz" +if [[ ! -f "${r1}" || ! 
-f "${r2}" ]]; then + echo "ERROR: reads not found in ${READS_DIR}" >&2 + exit 1 +fi + +echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}" + +STDERR_LOG=$(mktemp) +trap 'rm -f "${STDERR_LOG}"' EXIT + +"${BINARY}" index \ + --output "${INDEX_PATH}" \ + --force \ + --theta 0 \ + --with-counts \ + --label "${SPECIMEN}" \ + --meta "species=${species}" \ + "${r1}" "${r2}" \ + 2>"${STDERR_LOG}" + +cat "${STDERR_LOG}" >&2 + +python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}" +import sys, re + +species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +stats = {} +state = 'scan' + +with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): state = 'rows' + elif state == 'rows': + if is_sep(s): state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats[parts[0]] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + +STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index'] +row = [species, strain] +for stage in STAGE_ORDER: + w, r = stats.get(stage, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +tw, tr = stats.get('TOTAL', 
#!/usr/bin/env bash
# Usage: index_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Outputs:
#   specimen_index_presence/SPECIMEN/index.done  (written by obikmer)
#   stats/indexing_presence/SPECIMEN.stats       (one CSV data row, no header)
#
# The indexer's stderr is captured in a temporary log, replayed on this
# script's stderr, and then scanned for the per-stage "stage ... wall" table.
set -euo pipefail

SPECIMEN="${1:?Usage: index_one_presence.sh SPECIMEN}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

# Split the Make stem at the first "--".
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
fi

echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# Record the exit status instead of letting `set -e` abort right here:
# otherwise the EXIT trap would delete the log before it is replayed and the
# indexer's own error message would never reach the user.
rc=0
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || rc=$?

cat "${STDERR_LOG}" >&2

if (( rc != 0 )); then
    echo "ERROR: obikmer index failed for ${SPECIMEN} (exit ${rc})" >&2
    exit "${rc}"
fi

python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
import sys, re

species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

# (suffix, divisor-to-seconds); every two-character suffix comes before the
# bare 's' so that e.g. 'ms' is never mistaken for seconds.
WALL_UNITS = (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ms', 1e3), ('s', 1.0))

def strip_ansi(s):
    """Remove ANSI CSI escape sequences (colours, cursor moves)."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert a duration such as '12.5ms' or '3.2s' to seconds.

    Unknown or malformed values yield 0.0 instead of raising.
    """
    s = s.strip()
    for unit, divisor in WALL_UNITS:
        if s.endswith(unit):
            try:
                return float(s[:-len(unit)]) / divisor
            except ValueError:
                return 0.0
    return 0.0

def parse_rss(s):
    """Convert a size such as '1.5 GB' to bytes (0 when unparseable)."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m:
        return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line with no letters or digits (a table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

stats = {}
# scan -> in_header -> rows -> total: find the header line, skip to the rule
# below it, read stage rows until the next rule, then read one total row.
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()
        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # Stored under a fixed key so the lookup below does not
                    # depend on how the reporter labels its total row.
                    stats['TOTAL'] = (parse_wall(parts[1]),
                                      parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
# Tokens that end a strain name when scanning a FASTA definition.
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
              'endosymbiont', 'of'}
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')


def is_stop(tok):
    """Return True if `tok` (case-insensitive) is a stop word or starts with a stop prefix."""
    t = tok.lower()
    return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)


def sanitize(s):
    """Replace every character outside [A-Za-z0-9._-] with '_' and trim '_' from both ends."""
    return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')


def collect_tokens(text):
    """Join the sanitized tokens of `text` with '_' up to the first stop token.

    Trailing ',' and '.' are removed from each token before it is tested and
    sanitized; tokens that sanitize to an empty string are dropped.
    """
    parts = []
    for tok in text.split():
        tok = tok.rstrip(',.')
        if is_stop(tok):
            break
        parts.append(sanitize(tok))
    return '_'.join(filter(None, parts))


def parse_organism(defn, gcf_id):
    """Derive `(species, strain)` identifiers from a FASTA definition string.

    The species is built from the first two words. The strain is taken, in
    order of preference, from a `str. X [substr. Y]` clause, from the tokens
    after the word `strain`, or from whatever follows the species (after an
    optional leading `subsp. X` and `serovar X`). `gcf_id` is used when no
    strain can be extracted.

    Definitions with fewer than two words, which `first_definition` returns
    when a header carries no "definition" field, yield `gcf_id` as the strain,
    and as the species too when the definition is empty.
    """
    words = defn.split()
    if len(words) < 2:
        # Previously words[1] raised IndexError here.
        species = sanitize(words[0]) if words else ''
        return (species or gcf_id), gcf_id

    species = sanitize(words[0] + '_' + words[1])

    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
    if m:
        strain = sanitize(m.group(1))
        if m.group(2):
            strain += '_' + sanitize(m.group(2))
        return species, strain

    m = re.search(r'\bstrain\b\s+(.*)', defn)
    if m:
        strain = collect_tokens(m.group(1))
        if strain:
            return species, strain

    # Fall back to whatever follows "Genus species [subsp. X] [serovar Y]".
    remainder = re.sub(r'^\S+ \S+\s*', '', defn)
    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
    strain = collect_tokens(remainder)
    return species, strain if strain else gcf_id


def first_definition(path):
    """Return the definition of the first FASTA record in gzip file `path`.

    Uses the `"definition":"..."` field of the first header line when present,
    otherwise the header's first whitespace-separated token. Falls back to
    `Path(path).stem` when the file has no header line or the header is empty.
    """
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            if line.startswith('>'):
                m = re.search(r'"definition":"([^"]*)"', line)
                if m:
                    return m.group(1)
                tokens = line[1:].split()
                if tokens:
                    return tokens[0]
                break  # bare '>' header: nothing usable, use the file stem
    return Path(path).stem
def _species_and_strain(path):
    """Return `(species, strain)` for the genome file at `path`."""
    gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
    return parse_organism(first_definition(path), gcf_id)


def main():
    """Print Make dependency declarations for the genomes given on the command line.

    Default mode (`make_deps.py GENOME...`): prints the `SPECIMENS` and
    `SPECIES` variable assignments, then one group of `target: prerequisite`
    lines per specimen and per species. No recipes are emitted.

    `make_deps.py --dir-for GENOME...`: prints only the simulated-data
    directory of each genome, one per line, using the same relative
    `simulated_data/<species>/<strain>` path as the dependency lines.
    simulate.sh calls the script this way; before this mode existed the flag
    was treated as a genome path and the script failed opening it.
    """
    args = sys.argv[1:]

    if args and args[0] == '--dir-for':
        for path in args[1:]:
            sp, st = _species_and_strain(path)
            print(f'simulated_data/{sp}/{st}')
        return

    entries = []   # (specimen, species, sim_dir, genome_path)
    species_seen = []   # first-seen order, no duplicates

    for path in sorted(args):
        sp, st = _species_and_strain(path)
        specimen = f'{sp}--{st}'
        sim_dir = f'simulated_data/{sp}/{st}'
        entries.append((specimen, sp, sim_dir, path))
        if sp not in species_seen:
            species_seen.append(sp)

    specimens = [e[0] for e in entries]
    print('SPECIMENS :=', ' '.join(specimens))
    print('SPECIES :=', ' '.join(species_seen))

    for specimen, species, sim_dir, genome in entries:
        reads = f'{sim_dir}/reads_R1.fastq.gz'
        p_done = f'specimen_index_presence/{specimen}/index.done'
        p_stats = f'stats/indexing_presence/{specimen}.stats'
        c_done = f'specimen_index_count/{specimen}/index.done'
        c_stats = f'stats/indexing_count/{specimen}.stats'
        ref = f'reference_index/{specimen}.npz'
        vp = f'stats/verify_presence/{specimen}.stats'
        vc = f'stats/verify_count/{specimen}.stats'

        print()
        print(f'# {specimen}')
        print(f'{reads}: {genome}')
        print(f'{ref}: {reads}')
        print(f'{p_done} {p_stats}: {reads}')
        print(f'{c_done} {c_stats}: {reads}')
        print(f'{vp}: {ref} {p_done}')
        print(f'{vc}: {ref} {c_done}')

    print()
    for sp in species_seen:
        sp_done = f'specific_index_presence/{sp}/index.done'
        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
        sc_done = f'specific_index_count/{sp}/index.done'
        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
        print(f'# {sp}')
        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
        print(f'{sc_done} {sc_stats}: global_index_count/index.done')
#!/usr/bin/env bash
# Merge every per-specimen count index found under specimen_index_count/ into
# global_index_count/ and append one row of per-stage wall time / RSS figures
# (parsed from the merger's stderr) to a fresh stats/merge_count/run_NNN.csv.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
OUTPUT="${SCRIPT_DIR}/global_index_count"
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"

mkdir -p "${STATS_DIR}"

# The run number is the count of run_*.csv files already present.
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"

printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"

# parse_reporter RUN N_SOURCES LOGFILE
# Prints one CSV data row built from the "stage ... wall" table in LOGFILE.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

# (suffix, divisor-to-seconds); every two-character suffix comes before the
# bare 's' so that e.g. 'ms' is never mistaken for seconds.
WALL_UNITS = (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ms', 1e3), ('s', 1.0))

def strip_ansi(s):
    """Remove ANSI CSI escape sequences (colours, cursor moves)."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert a duration such as '12.5ms' or '3.2s' to seconds.

    Unknown or malformed values yield 0.0 instead of raising.
    """
    s = s.strip()
    for unit, divisor in WALL_UNITS:
        if s.endswith(unit):
            try:
                return float(s[:-len(unit)]) / divisor
            except ValueError:
                return 0.0
    return 0.0

def parse_rss(s):
    """Convert a size such as '1.5 GB' to bytes (0 when unparseable)."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m:
        return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line with no letters or digits (a table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

stats = {}
# scan -> in_header -> rows -> total: find the header line, skip to the rule
# below it, read stage rows until the next rule, then read one total row.
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()

        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # Stored under a fixed key so the lookup below does not
                    # depend on how the reporter labels its total row.
                    stats['TOTAL'] = (parse_wall(parts[1]),
                                      parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}

# Plain read loop instead of `mapfile`, which needs bash >= 4 (the stock
# macOS bash is older).
sources=()
while IFS= read -r src_dir; do
    sources+=("${src_dir}")
done < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# Record the exit status instead of letting `set -e` abort right here:
# otherwise the EXIT trap would delete the log before it is replayed and the
# merger's own error message would never reach the user.
rc=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || rc=$?

cat "${STDERR_LOG}" >&2

if (( rc != 0 )); then
    echo "ERROR: obikmer merge failed (exit ${rc})" >&2
    exit "${rc}"
fi

parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
#!/usr/bin/env bash
# Merge every per-specimen presence index found under specimen_index_presence/
# into global_index_presence/ and append one row of per-stage wall time / RSS
# figures (parsed from the merger's stderr) to a fresh
# stats/merge_presence/run_NNN.csv.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
OUTPUT="${SCRIPT_DIR}/global_index_presence"
STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"

mkdir -p "${STATS_DIR}"

# The run number is the count of run_*.csv files already present.
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"

printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"

# parse_reporter RUN N_SOURCES LOGFILE
# Prints one CSV data row built from the "stage ... wall" table in LOGFILE.
parse_reporter() {
    local run="$1" n_sources="$2" logfile="$3"
    python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

# (suffix, divisor-to-seconds); every two-character suffix comes before the
# bare 's' so that e.g. 'ms' is never mistaken for seconds.
WALL_UNITS = (('ns', 1e9), ('us', 1e6), ('\u00b5s', 1e6), ('\u03bcs', 1e6),
              ('ms', 1e3), ('s', 1.0))

def strip_ansi(s):
    """Remove ANSI CSI escape sequences (colours, cursor moves)."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert a duration such as '12.5ms' or '3.2s' to seconds.

    Unknown or malformed values yield 0.0 instead of raising.
    """
    s = s.strip()
    for unit, divisor in WALL_UNITS:
        if s.endswith(unit):
            try:
                return float(s[:-len(unit)]) / divisor
            except ValueError:
                return 0.0
    return 0.0

def parse_rss(s):
    """Convert a size such as '1.5 GB' to bytes (0 when unparseable)."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m:
        return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line with no letters or digits (a table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

stats = {}
# scan -> in_header -> rows -> total: find the header line, skip to the rule
# below it, read stage rows until the next rule, then read one total row.
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()

        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # Stored under a fixed key so the lookup below does not
                    # depend on how the reporter labels its total row.
                    stats['TOTAL'] = (parse_wall(parts[1]),
                                      parse_rss(parts[3]) if len(parts) > 3 else 0)
                break

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}

# Plain read loop instead of `mapfile`, which needs bash >= 4 (the stock
# macOS bash is older).
sources=()
while IFS= read -r src_dir; do
    sources+=("${src_dir}")
done < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
    echo "ERROR: no indexes found in ${IDX_DIR}" >&2
    exit 1
fi

echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"

STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# Record the exit status instead of letting `set -e` abort right here:
# otherwise the EXIT trap would delete the log before it is replayed and the
# merger's own error message would never reach the user.
rc=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    --force-presence \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || rc=$?

cat "${STDERR_LOG}" >&2

if (( rc != 0 )); then
    echo "ERROR: obikmer merge failed (exit ${rc})" >&2
    exit "${rc}"
fi

parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
#
# The genome is decompressed to a temporary FASTA, its length is measured, and
# the read count handed to `iss generate` is
# ceil(COVERAGE * genome_size / (2 * READ_LENGTH)).
# Output files are written with the prefix output_dir/reads.
# CPUS may be set in the environment to override the detected core count.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"

genome_file="${1:?Usage: simulate_one.sh genome.fna.gz output_dir}"
out_dir="${2:?Usage: simulate_one.sh genome.fna.gz output_dir}"

mkdir -p "${out_dir}"

# Use a private temporary directory rather than a file template with a
# ".fna" suffix: BSD/macOS mktemp only substitutes X's at the very end of the
# template, so a suffixed template is not randomised there and concurrent
# runs (e.g. under `make -j`) would collide on the same path.
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")
trap 'rm -rf "${tmp_dir}"' EXIT
tmp_fasta="${tmp_dir}/genome.fna"

gzip -dc "${genome_file}" > "${tmp_fasta}"

# Bases only: drop header lines and all whitespace before counting.
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")

echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"

"${ISS}" generate \
    --genomes "${tmp_fasta}" \
    --model HiSeq \
    --n_reads "${n_reads}" \
    --cpus "${CPUS}" \
    --compress \
    --output "${out_dir}/reads"
# ── encoding ──────────────────────────────────────────────────────────────────

# 2-bit code per nucleotide; lower case is accepted on input.
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
           'a': 0, 'c': 1, 'g': 2, 't': 3}

_DECODE = ['A', 'C', 'G', 'T']


def encode_kmer(s: str) -> int:
    """Pack the nucleotides of `s` into an integer, 2 bits each, first base most significant.

    Raises KeyError for any character other than A/C/G/T (either case).
    """
    kmer = 0
    for c in s:
        kmer = (kmer << 2) | _ENCODE[c]
    return kmer


def decode_kmer(val: int, k: int) -> str:
    """Inverse of `encode_kmer`: return the upper-case `k`-base string stored in `val`."""
    bases = []
    for _ in range(k):
        bases.append(_DECODE[val & 3])
        val >>= 2
    return ''.join(reversed(bases))


# ── dump parsing ──────────────────────────────────────────────────────────────

def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32).

    Runs `obikmer_bin dump index_dir`, skips the first output line (header)
    and reads every following line as `kmer,count[,...]`. The returned arrays
    are ordered by encoded k-mer value, counts permuted accordingly.

    Exits the process with status 1 if the dump command returns non-zero.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers, counts = [], []
    # The context manager closes the pipe and waits for the child on exit.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        next(proc.stdout, None)  # header line
        for line in proc.stdout:
            parts = line.rstrip('\n').split(',')
            kmers.append(encode_kmer(parts[0]))
            counts.append(int(parts[1]))
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    # Convert once and reuse the array for both the sort key and the result.
    kmer_arr = np.array(kmers, dtype=np.uint64)
    order = np.argsort(kmer_arr, kind='stable')
    return kmer_arr[order], np.array(counts, dtype=np.uint32)[order]
# ── comparison ────────────────────────────────────────────────────────────────

def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
            idx_kmers: np.ndarray, idx_counts: np.ndarray,
            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).

    `ref_kmers` and `idx_kmers` must be sorted and free of duplicates;
    `ref_counts` / `idx_counts` are aligned with them.

    false_neg : k-mers of the reference that are missing from the index
    false_pos : k-mers of the index that are missing from the reference
    cm_*      : for k-mers present in both but with differing counts, the
                k-mer, its reference count and its index count

    Either input may be empty. Previously an empty index raised IndexError.
    """
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)

    # Locate each reference k-mer in the sorted index. A position equal to
    # len(idx_kmers) means "past the end": such k-mers cannot be shared, so
    # they are masked out instead of being clipped into range (clipping with
    # an upper bound of -1 is what broke on an empty index).
    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
    in_range = pos_in_idx < len(idx_kmers)
    shared_mask = np.zeros(len(ref_kmers), dtype=bool)
    shared_mask[in_range] = idx_kmers[pos_in_idx[in_range]] == ref_kmers[in_range]

    shared_ref_counts = ref_counts[shared_mask]
    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
    mismatch_mask = shared_ref_counts != shared_idx_counts

    cm_kmers = ref_kmers[shared_mask][mismatch_mask]
    cm_ref_counts = shared_ref_counts[mismatch_mask]
    cm_idx_counts = shared_idx_counts[mismatch_mask]

    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
# ── main ─────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: compare one count index with its reference.

    With `--header`, prints the CSV header and returns. Otherwise loads the
    reference .npz, streams the index dump, writes a human-readable summary to
    stderr, optionally saves the false-negative / false-positive /
    count-mismatch k-mers, and prints one CSV data row to stdout.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?',
                    help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='obikmer index directory (built with --with-counts)')
    ap.add_argument('--obikmer', default='obikmer',
                    help='Path to obikmer binary')
    ap.add_argument('--species', default='')
    ap.add_argument('--strain', default='')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    ap.add_argument('--save-cm', metavar='FILE',
                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are optional only so that --header can run without them.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the first data row of the dump (line 0 is the header).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    k = len(dump_lines[1].split(',')[0]) if len(dump_lines) > 1 else 0
    if k == 0:
        # An index with no k-mers: the statistics are still meaningful, but
        # k-mer values cannot be turned back into strings.
        print('WARNING: dump has no data row; k is unknown, --save-* outputs are skipped',
              file=sys.stderr)

    # Load reference
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    npz = np.load(args.reference)
    ref_kmers = npz['kmers']    # sorted uint64
    ref_counts = npz['counts']  # uint32

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers, idx_counts = load_index(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
        ref_kmers, ref_counts, idx_kmers, idx_counts)

    n_shared = len(ref_kmers) - len(false_neg)
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0

    print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
    print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)',
          file=sys.stderr)

    if args.save_fn and k and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_fp and k and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_cm and k and len(cm_kmers):
        with open(args.save_cm, 'w') as fh:
            fh.write('kmer,ref_count,idx_count\n')
            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
# ── encoding ──────────────────────────────────────────────────────────────────

# Nucleotide -> 2-bit code (either letter case) and the reverse lookup.
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
           'a': 0, 'c': 1, 'g': 2, 't': 3}

_DECODE = ['A', 'C', 'G', 'T']


def encode_kmer(s: str) -> int:
    """Return `s` packed as an integer: 2 bits per base, first base in the high bits."""
    packed = 0
    for base in s:
        packed = packed * 4 + _ENCODE[base]
    return packed


def decode_kmer(val: int, k: int) -> str:
    """Return the `k` bases held in the low 2*k bits of `val`, first base first."""
    # Walk the 2-bit fields from the most significant one down to bit 0.
    return ''.join(_DECODE[(val >> shift) & 3]
                   for shift in range(2 * (k - 1), -1, -2))
# ── single-pass dump ──────────────────────────────────────────────────────────

def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
    """Read `obikmer dump` of the merged index in a single pass.

    The first output line is the header: its first column is skipped and the
    remaining columns are the specimen labels. Every following line holds a
    k-mer string followed by one integer count per specimen.

    Returns:
        specimen_names : column labels in dump order
        per_specimen   : label -> (encoded k-mers, counts), restricted to the
                         rows where that specimen's count is > 0

    Exits the process with status 1 if the dump command returns non-zero.
    """
    dump = subprocess.Popen([obikmer_bin, 'dump', index_dir],
                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                            text=True)

    _, *specimen_names = dump.stdout.readline().rstrip('\n').split(',')
    per_specimen: dict[str, tuple[list[int], list[int]]] = {
        label: ([], []) for label in specimen_names}

    for record in dump.stdout:
        fields = record.rstrip('\n').split(',')
        code = encode_kmer(fields[0])
        # Column 0 is the k-mer itself, so specimen columns start at 1.
        for col, label in enumerate(specimen_names, start=1):
            n = int(fields[col])
            if n <= 0:
                continue
            kmer_bucket, count_bucket = per_specimen[label]
            kmer_bucket.append(code)
            count_bucket.append(n)

    if dump.wait() != 0:
        print(f'ERROR: obikmer dump exited {dump.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
0, len(idx_kmers) - 1) + shared_mask = idx_kmers[pos_in_idx] == ref_kmers + mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]] + cm_kmers = ref_kmers[shared_mask][mismatch_mask] + cm_ref = ref_counts[shared_mask][mismatch_mask] + cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask] + + n_shared = int(shared_mask.sum()) + fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 + fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 + cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0 + + print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} ' + f'fn={len(false_neg):,} ({fn_pct:.4f}%) ' + f'fp={len(false_pos):,} ({fp_pct:.4f}%) ' + f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)', + file=sys.stderr) + + if save_fn and len(false_neg): + fn_file = save_fn / f'{name}_fn.txt' + fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n') + + if save_fp and len(false_pos): + fp_file = save_fp / f'{name}_fp.txt' + fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n') + + if save_cm and len(cm_kmers): + cm_file = save_cm / f'{name}_cm.csv' + lines = ['kmer,ref_count,idx_count'] + for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx): + lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}') + cm_file.write_text('\n'.join(lines) + '\n') + + return (f'{species},{strain},' + f'{len(ref_kmers)},{len(idx_kmers)},' + f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},' + f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}') + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('index', metavar='INDEX_DIR', nargs='?', + help='Merged count index directory') + ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?', + help='Directory containing per-specimen .npz reference files') + 
# ── main ─────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: verify every specimen column of the merged count index.

    With `--header`, prints the CSV header and returns. Otherwise streams the
    merged dump once and prints one CSV row per specimen for which a reference
    file exists in REF_DIR.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged count index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory for false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory for false-positive kmer lists')
    ap.add_argument('--save-cm', metavar='DIR',
                    help='Directory for count-mismatch CSV files')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are optional only so that --header can run without them.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    save_cm = Path(args.save_cm) if args.save_cm else None
    for d in (save_fn, save_fp, save_cm):
        if d:
            d.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of the dump (line 0 is the header).
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    k = len(dump_lines[1].split(',')[0]) if len(dump_lines) > 1 else 0
    if k == 0:
        # No k-mers in the merged index: k-mer values cannot be decoded, so
        # the optional k-mer listings are turned off.
        print('WARNING: dump has no data row; k is unknown, --save-* outputs are skipped',
              file=sys.stderr)
        save_fn = save_fp = save_cm = None

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        kmers, counts = per_specimen[name]
        row = compare_specimen(name, kmers, counts, ref_dir, k,
                               save_fn, save_fp, save_cm)
        if row:
            print(row)
#!/usr/bin/env bash
# Verify the merged count index against the per-specimen references.
# Writes stats/verify_merge_count/current.csv (header + one row per specimen)
# and keeps a numbered copy count_NNN.csv next to it.
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

obikmer_bin="${here}/../src/target/release/obikmer"
merged_index="${here}/global_index_count"
reference_dir="${here}/reference_index"
stats_dir="${here}/stats/verify_merge_count"
current_csv="${stats_dir}/current.csv"

# Run the verifier with the project's virtualenv interpreter.
verify() {
    "${here}/../.venv/bin/python3" "${here}/verify_merge_count.py" "$@"
}

mkdir -p "${stats_dir}"

verify --header >"${current_csv}"
verify --obikmer "${obikmer_bin}" "${merged_index}" "${reference_dir}" >>"${current_csv}"

# The archive number is the count of count_*.csv files already present.
n_archived=$(find "${stats_dir}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')
archive_csv="${stats_dir}/count_$(printf '%03d' "${n_archived}").csv"
cp "${current_csv}" "${archive_csv}"

echo "Done. Results → ${archive_csv}"
# ── single-pass dump ──────────────────────────────────────────────────────────

def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, list[int]]]:
    """Read `obikmer dump` of the merged index in a single pass.

    The first output line is the header: its first column is skipped and the
    remaining columns are the specimen labels. Every following line holds a
    k-mer string followed by one integer per specimen.

    Returns:
        specimen_names : column labels in dump order (excluding the first column)
        per_specimen   : label -> encoded k-mers whose value in that column is > 0

    Exits the process with status 1 if the dump command returns non-zero.
    """
    dump = subprocess.Popen([obikmer_bin, 'dump', index_dir],
                            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                            text=True)

    _, *specimen_names = dump.stdout.readline().rstrip('\n').split(',')
    per_specimen: dict[str, list[int]] = {label: [] for label in specimen_names}

    for record in dump.stdout:
        fields = record.rstrip('\n').split(',')
        code = encode_kmer(fields[0])
        # Column 0 is the k-mer itself, so specimen columns start at 1.
        for col, label in enumerate(specimen_names, start=1):
            if int(fields[col]) > 0:
                per_specimen[label].append(code)

    if dump.wait() != 0:
        print(f'ERROR: obikmer dump exited {dump.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
+ """ + ref_path = ref_dir / f'{name}.npz' + if not ref_path.exists(): + print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr) + return '' + + species = name.split('--')[0] + strain = name[len(species) + 2:] + + ref_kmers = np.load(ref_path)['kmers'] # sorted uint64 + idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64) + + false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True) + false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True) + + fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 + fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 + + print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} ' + f'fn={len(false_neg):,} ({fn_pct:.4f}%) ' + f'fp={len(false_pos):,} ({fp_pct:.4f}%)', + file=sys.stderr) + + if save_fn and len(false_neg): + fn_file = save_fn / f'{name}_fn.txt' + fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n') + + if save_fp and len(false_pos): + fp_file = save_fp / f'{name}_fp.txt' + fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n') + + return (f'{species},{strain},' + f'{len(ref_kmers)},{len(idx_kmers)},' + f'{len(false_neg)},{len(false_pos)},' + f'{fn_pct:.4f},{fp_pct:.4f}') + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('index', metavar='INDEX_DIR', nargs='?', + help='Merged presence index directory') + ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?', + help='Directory containing per-specimen .npz reference files') + ap.add_argument('--obikmer', default='obikmer') + ap.add_argument('--header', action='store_true', + help='Print CSV header and exit') + ap.add_argument('--save-fn', metavar='DIR', + help='Directory to save false-negative kmer lists') + ap.add_argument('--save-fp', metavar='DIR', + 
help='Directory to save false-positive kmer lists') + args = ap.parse_args() + + if args.header: + print('species,strain,ref_kmers,idx_kmers,' + 'false_neg,false_pos,fn_pct,fp_pct') + return + + # INDEX_DIR and REF_DIR are optional at the argparse level only so that + # --header can be used on its own; every other invocation needs both. + if args.index is None or args.ref_dir is None: + ap.error('INDEX_DIR and REF_DIR are required unless --header is given') + + ref_dir = Path(args.ref_dir) + save_fn = Path(args.save_fn) if args.save_fn else None + save_fp = Path(args.save_fp) if args.save_fp else None + if save_fn: save_fn.mkdir(parents=True, exist_ok=True) + if save_fp: save_fp.mkdir(parents=True, exist_ok=True) + + # Detect k + out1 = subprocess.check_output( + [args.obikmer, 'dump', '--head', '1', args.index], + stderr=subprocess.DEVNULL, text=True) + k = len(out1.splitlines()[1].split(',')[0]) + + print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr) + specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index) + print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr) + + for name in specimen_names: + row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp) + if row: + print(row) + + +if __name__ == '__main__': + main() diff --git a/benchmark/verify_merge_presence.sh b/benchmark/verify_merge_presence.sh new file mode 100755 index 0000000..bea5ddf --- /dev/null +++ b/benchmark/verify_merge_presence.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +INDEX="${SCRIPT_DIR}/global_index_presence" +REF_DIR="${SCRIPT_DIR}/reference_index" +STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py" + +mkdir -p "${STATS_DIR}" + +CURRENT="${STATS_DIR}/current.csv" + +"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}" + +"${PYTHON}" "${VERIFY_PY}" \ + --obikmer "${BINARY}" \ + "${INDEX}" "${REF_DIR}" \ + >>"${CURRENT}" + +run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')") 
+ARCHIVE="${STATS_DIR}/presence_${run_n}.csv" +cp "${CURRENT}" "${ARCHIVE}" + +echo "Done. Results → ${ARCHIVE}" diff --git a/benchmark/verify_one_count.sh b/benchmark/verify_one_count.sh new file mode 100755 index 0000000..3dfb8d6 --- /dev/null +++ b/benchmark/verify_one_count.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Usage: verify_one_count.sh SPECIMEN +# SPECIMEN = "species--strain" (Make pattern stem) +# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header) +set -euo pipefail + +SPECIMEN="${1:?usage: verify_one_count.sh SPECIMEN}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +VERIFY_PY="${SCRIPT_DIR}/verify_count.py" + +species="${SPECIMEN%%--*}" +strain="${SPECIMEN#*--}" + +REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz" +INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}" +STATS_DIR="${SCRIPT_DIR}/stats/verify_count" +STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIMEN}] verifying count" + +"${PYTHON}" "${VERIFY_PY}" \ + --obikmer "${BINARY}" \ + --species "${species}" \ + --strain "${strain}" \ + "${REF_NPZ}" "${INDEX_DIR}" \ + >"${STATS_FILE}" diff --git a/benchmark/verify_one_presence.sh b/benchmark/verify_one_presence.sh new file mode 100755 index 0000000..252a2c3 --- /dev/null +++ b/benchmark/verify_one_presence.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Usage: verify_one_presence.sh SPECIMEN +# SPECIMEN = "species--strain" (Make pattern stem) +# Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header) +set -euo pipefail + +SPECIMEN="${1:?usage: verify_one_presence.sh SPECIMEN}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +VERIFY_PY="${SCRIPT_DIR}/verify_presence.py" + +species="${SPECIMEN%%--*}" +strain="${SPECIMEN#*--}" + +REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz" 
+INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}" +STATS_DIR="${SCRIPT_DIR}/stats/verify_presence" +STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIMEN}] verifying presence" + +"${PYTHON}" "${VERIFY_PY}" \ + --obikmer "${BINARY}" \ + --species "${species}" \ + --strain "${strain}" \ + "${REF_NPZ}" "${INDEX_DIR}" \ + >"${STATS_FILE}" diff --git a/benchmark/verify_presence.py b/benchmark/verify_presence.py new file mode 100755 index 0000000..7041dd5 --- /dev/null +++ b/benchmark/verify_presence.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +"""Compare an obikmer index against a reference kmer set (presence/absence). + +Loads the reference .npz (sorted uint64 kmers built by build_reference.py), +streams the output of `obikmer dump`, encodes each kmer string to uint64, +then reports false negatives and false positives using numpy set operations. + +Output to stdout: one CSV row + species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct +""" +import argparse +import subprocess +import sys + +import numpy as np + + +# ── encoding ────────────────────────────────────────────────────────────────── + +_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, + 'a': 0, 'c': 1, 'g': 2, 't': 3} + +_DECODE = ['A', 'C', 'G', 'T'] + + +def encode_kmer(s: str) -> int: + kmer = 0 + for c in s: + kmer = (kmer << 2) | _ENCODE[c] + return kmer + + +def decode_kmer(val: int, k: int) -> str: + bases = [] + for _ in range(k): + bases.append(_DECODE[val & 3]) + val >>= 2 + return ''.join(reversed(bases)) + + +# ── dump parsing ────────────────────────────────────────────────────────────── + +def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray: + """Stream `obikmer dump` and return a sorted uint64 array of kmer integers.""" + cmd = [obikmer_bin, 'dump', index_dir] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, + text=True) + kmers = [] + header = True + for line in proc.stdout: + if 
header: + header = False + continue + kmer_str = line.split(',', 1)[0] + kmers.append(encode_kmer(kmer_str)) + proc.wait() + if proc.returncode != 0: + print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr) + sys.exit(1) + arr = np.array(kmers, dtype=np.uint64) + arr.sort() + return arr + + +# ── comparison ──────────────────────────────────────────────────────────────── + +def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Return (false_negatives, false_positives) as uint64 arrays.""" + false_neg = np.setdiff1d(ref, idx, assume_unique=True) + false_pos = np.setdiff1d(idx, ref, assume_unique=True) + return false_neg, false_pos + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file') + ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory') + ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary') + ap.add_argument('--species', default='', help='Species label for CSV row') + ap.add_argument('--strain', default='', help='Strain label for CSV row') + ap.add_argument('--header', action='store_true', help='Print CSV header and exit') + ap.add_argument('--save-fp', metavar='FILE', + help='Save false-positive kmer strings to FILE') + ap.add_argument('--save-fn', metavar='FILE', + help='Save false-negative kmer strings to FILE') + args = ap.parse_args() + + if args.header: + print('species,strain,ref_kmers,idx_kmers,' + 'false_neg,false_pos,fn_pct,fp_pct') + return + + # REF_NPZ and INDEX_DIR are optional at the argparse level only so that + # --header can be used on its own; every other invocation needs both. + if args.reference is None or args.index is None: + ap.error('REF_NPZ and INDEX_DIR are required unless --header is given') + + # Detect k from the index (one cheap call before the full dump). 
+ cmd1 = [args.obikmer, 'dump', '--head', '1', args.index] + out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True) + k = len(out1.splitlines()[1].split(',')[0]) + + # Load reference + print(f'Loading reference: {args.reference}', file=sys.stderr) + npz = np.load(args.reference) + ref_kmers = npz['kmers'] # already sorted uint64 + + # Load index + print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr) + idx_kmers = load_index_kmers(args.obikmer, args.index) + + print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr) + + false_neg, false_pos = compare(ref_kmers, idx_kmers) + + fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 + fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 + + print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr) + print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr) + + if args.save_fn and len(false_neg): + with open(args.save_fn, 'w') as fh: + for v in false_neg: + fh.write(decode_kmer(int(v), k) + '\n') + print(f'False negatives saved → {args.save_fn}', file=sys.stderr) + + if args.save_fp and len(false_pos): + with open(args.save_fp, 'w') as fh: + for v in false_pos: + fh.write(decode_kmer(int(v), k) + '\n') + print(f'False positives saved → {args.save_fp}', file=sys.stderr) + + print(f'{args.species},{args.strain},' + f'{len(ref_kmers)},{len(idx_kmers)},' + f'{len(false_neg)},{len(false_pos)},' + f'{fn_pct:.4f},{fp_pct:.4f}') + + +if __name__ == '__main__': + main() diff --git a/docmd/architecture/rebuild_filter.md b/docmd/architecture/rebuild_filter.md new file mode 100644 index 0000000..443aa75 --- /dev/null +++ b/docmd/architecture/rebuild_filter.md @@ -0,0 +1,105 @@ +# Rebuild / filter — column-first design + +## Problem with the current two-pass design + +`rebuild_partition` currently makes **two full passes** over source data: + +**Pass 1** — read unitigs → MPHF lookup (source) → 
read row (108 values) → apply filter → push kmer into `GraphDeBruijn`, **discard row**. + +**Pass 2** — read unitigs again → MPHF lookup again → read row again → for each passing kmer, look up slot in new MPHF → fill column builders. + +Both passes do random access into the source matrix: for each kmer, the MPHF returns a slot, then we read 108 values scattered across 108 column positions. This is cache-hostile even with a packed matrix (`.pbmx`), because the matrix is column-major: consecutive row reads jump across the file. + +## Memory budget + +The `keep` bitvector costs **1 bit per slot**. With 256 partitions and realistic kmer counts, each partition holds at most a few tens of millions of slots → a few MB per bitvector. Even in the absolute worst case (800 M slots), it stays under 100 MB. This is negligible. + +The `slot_map` option (Option B, 8–16 bytes per slot) is heavier but still bounded: at 15 M slots and 8 bytes, that is 120 MB per partition, acceptable for a single worker. + +## Key observation + +**The filter operates on column values, not on kmers.** A filter like `--max-outgroup-count 0` only needs to know, for each slot, whether any outgroup column is non-zero. It does not need to know which kmer occupies that slot. + +This means filtering can be done as a **sequential column scan** that produces a `keep: BitVec[n_slots]` — no MPHF lookups, no kmer knowledge, perfectly cache-friendly. + +## Proposed single-scan design + +### Step 1 — column scan → `keep` bitvector + +``` +for each column c in source matrix: + read column c sequentially (one mmap range) + update keep[slot] according to filter contribution of column c +``` + +For `GroupQuorumFilter` with ingroup/outgroup: +- ingroup columns: count presence per slot → `ingroup_count[slot]` +- outgroup columns: `keep[slot] &= (value[slot] == 0)` (early-exit possible) + +Result: `keep: BitVec` of size `n_slots`, computed with purely sequential IO. 
+ +### Step 2 — unitig scan → kept kmers + new MPHF + +``` +for each kmer in unitig files: + old_slot = old_MPHF(kmer) + if keep[old_slot]: + push kmer into new GraphDeBruijn + record (old_slot, kmer) ← or just old_slot in order +``` + +Build new MPHF from `GraphDeBruijn` via `materialize_layer`. + +### Step 3 — fill new matrix + +Two sub-options: + +**Option A — from recorded (old_slot, kmer) pairs:** + +``` +for each (old_slot, kmer) in recorded list: + new_slot = new_MPHF(kmer) + for each column c: + new_matrix[new_slot, c] = old_matrix[old_slot, c] +``` + +Memory cost: `n_kept × (8 + 8)` bytes for `(old_slot: usize, kmer: CanonicalKmer)`. +For species-specific filters, `n_kept` is small. For unfiltered rebuild, `n_kept = n_slots`. + +**Option B — column-by-column copy using old→new slot mapping:** + +Precompute `slot_map: Vec<Option<usize>>` of size `n_slots`: +- For each kmer in unitig file: `slot_map[old_MPHF(kmer)] = Some(new_MPHF(kmer))` + +Then for each source column: +``` +read source column sequentially +for each slot where slot_map[slot] = Some(new_slot): + write value to new column at new_slot +``` + +Memory cost: `n_slots × sizeof(usize)` for the slot map (one usize per source slot). +IO pattern: sequential read of each source column → random write into new column builders. + +Option B avoids storing kmer values and works uniformly regardless of filter selectivity. 
+ +## Comparison + +| | Current | Proposed | +|---|---|---| +| Disk reads | 2× unitigs + 2× random matrix | 1× columns (sequential) + 1× unitigs | +| MPHF lookups (source) | 2× N_kmers | 1× N_kept (step 2) or 0 (option B, col scan only) | +| Cache behavior | poor (random row access) | good (sequential column scan) | +| Extra memory | none | slot_map (option B) or (old_slot, kmer) list (option A) | + +## Files to modify + +- `src/obikpartitionner/src/rebuild_layer.rs` — `rebuild_partition` and `iter_src_layers` +- Possibly `src/obicompactvec/` — add column iterator API if not already present +- `src/obilayeredmap/` — check if per-column sequential access is exposed on `SrcLayerData` + +## Open questions + +- Does `SrcLayerData` expose per-column sequential iteration, or only `lookup(kmer, n_genomes)` random access? +- For option B: are new column builders writable in random-slot order (i.e. `set_val(slot, value)` without sequential constraint)? +- For `GroupQuorumFilter` specifically: can the filter be decomposed into independent per-column contributions, or does it need the full row? diff --git a/docmd/implementation/filtering.md b/docmd/implementation/filtering.md index 4dfab31..ea6d4a2 100644 --- a/docmd/implementation/filtering.md +++ b/docmd/implementation/filtering.md @@ -29,16 +29,23 @@ Multiple values separated by `|` are always OR-ed within the predicate. ### Path matching (`~` and `!~`) -Metadata values can represent hierarchical taxonomic paths such as +Metadata values can represent hierarchical concept paths such as `/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`. -- **Absolute pattern** (starts with `/`): the value must start with the pattern - at a segment boundary. - `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and - `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`. -- **Bare segment** (no leading `/`): the value must contain the pattern as an - exact path component anywhere. 
- `taxon~Betula` matches any path that has `Betula` as one of its segments. +Stored taxonomy values always start with `/` (the root of the path). +Query patterns do **not** need to start with `/` — a leading `/` is an optional +start anchor, not a requirement. + +| Pattern form | Semantics | +|---|---| +| `A/B` | contiguous sub-path A then B, anywhere in the value | +| `/A/B` | value starts with A then B | +| `A/B$` | value ends with A then B | +| `/A/B$` | value is exactly A then B | +| `A@x/B` | A with class `x` followed by B with any class | + +- `taxon~/Betulaceae/Betula` matches any path that starts with `Betulaceae` then `Betula`. +- `taxon~Betula` matches any path containing `Betula` as a segment, anywhere. ### Missing metadata key → NA diff --git a/docmd/implementation/obicompactvec.md b/docmd/implementation/obicompactvec.md new file mode 100644 index 0000000..301b021 --- /dev/null +++ b/docmd/implementation/obicompactvec.md @@ -0,0 +1,520 @@ +# obicompactvec — Complete Reference + +## Module structure + +``` +src/obicompactvec/src/ + lib.rs public re-exports + views.rs BitSliceView<'a>, IntSliceView<'a> — zero-copy read views + traits.rs ColumnWeights, CountPartials, BitPartials (matrix aggregation) + bitvec.rs PersistentBitVec, PersistentBitVecBuilder, BitIter + reader.rs PersistentCompactIntVec (read-only) + builder.rs PersistentCompactIntVecBuilder (read-write) + tempintvec.rs TempCompactIntVec, TempCompactIntVecBuilder (temp-file-backed) + tempbitvec.rs TempBitVec, TempBitVecBuilder (temp-file-backed) + bitmatrix.rs PersistentBitMatrix, PersistentBitMatrixBuilder + intmatrix.rs PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder + colgroup.rs ColGroup, MatrixGroupOps trait + format.rs file format constants, encode/decode helpers + layer_meta.rs LayerMeta (column metadata) + meta.rs matrix metadata +``` + +```mermaid +graph TD + views --> bitvec + views --> builder + views --> tempbitvec + views --> tempintvec + views --> bitmatrix + views --> 
intmatrix + format --> reader + format --> builder + reader --> intmatrix + reader --> tempintvec + builder --> intmatrix + builder --> tempintvec + bitvec --> tempbitvec + bitvec --> bitmatrix + tempintvec --> intmatrix + tempintvec --> bitmatrix + tempbitvec --> intmatrix + tempbitvec --> bitmatrix + colgroup --> intmatrix + colgroup --> bitmatrix + layer_meta --> bitmatrix + layer_meta --> intmatrix + meta --> bitmatrix + meta --> intmatrix +``` + +--- + +## Compact int encoding + +All integer vectors use the same two-tier encoding regardless of storage backend. + +**Primary array** — one `u8` per slot: + +- Values **0–254** are stored directly. No overhead. +- Value **255 is a sentinel**: the slot's actual value is ≥ 255 and lives in the overflow store. + +**Overflow store** — maps slot index to a `u32` value ≥ 255: + +- In `PersistentCompactIntVecBuilder`: a `HashMap<usize, u32>` in RAM. +- In `PersistentCompactIntVec` (reader): a sorted `[(slot: u64, value: u32)]` array in the mmap, with a sparse L1-resident index for binary search. + +```mermaid +flowchart LR + slot --> P["primary[slot]: u8"] + P -->|"< 255"| V["value = byte (0–254)"] + P -->|"= 255 sentinel"| OV["overflow store"] + OV -->|"Builder"| HM["HashMap<usize, u32>\nin RAM"] + OV -->|"PersistentCompactIntVec"| SA["sorted [(slot,value)] in mmap\n+ sparse L1 index"] +``` + +**Key property — sentinel 255 = +∞ on `u8`:** + +- `min(a, 255) = a` for all `a ≤ 254` → correct when only one side is overflow +- `max(a, 255) = 255` → correct sentinel when either side is overflow +- Only the **both-overflow** case requires reading actual values from the overflow store. + +In practice, k (overflow count) ≪ n (total slots). Observed genomic data: ~0.07% of kmer slots are in overflow. + +--- + +## View types + +The previous trait hierarchy (`BitSlice`, `BitSliceMut`, `IntSlice`, `IntSliceMut`) has been replaced by two concrete zero-copy view structs with inherent methods. Views are **`Copy`** — passing them is free. 
All read operations live on these two types. + +### `BitSliceView<'a>` + +```rust +#[derive(Clone, Copy)] +pub struct BitSliceView<'a> { pub(crate) words: &'a [u64], pub(crate) n: usize } +``` + +Bit `i` is at `words[i >> 6]` bit `i & 63` (LSB-first). Padding bits in the last word are zero. + +| Method | Cost | +|---|---| +| `len()`, `is_empty()` | O(1) | +| `get(slot)` | O(1) | +| `count_ones()` | POPCNT per word, O(n/64) | +| `count_zeros()` | `n − count_ones()`, O(n/64) | +| `iter() -> BitSliceIter<'a>` | O(1) setup, O(n) iteration | +| `partial_jaccard_dist(other: BitSliceView)` | `(a&b).popcount`, `(a\|b).popcount` per word, O(n/64) | +| `jaccard_dist(other: BitSliceView)` | from partial, O(n/64) | +| `hamming_dist(other: BitSliceView)` | `(a^b).popcount` per word, O(n/64) | + +`BitSliceIter<'a>`: word-level scan; one word per 64 iterations. + +### `IntSliceView<'a>` + +```rust +#[derive(Clone, Copy)] +pub struct IntSliceView<'a> { + pub(crate) primary: &'a [u8], + pub(crate) overflow_raw: &'a [u8], // sorted [(slot:u64, value:u32)] entries + pub(crate) n_overflow: usize, + pub(crate) n: usize, +} +``` + +`overflow_raw` contains `n_overflow` entries of `OVERFLOW_ENTRY_SIZE` bytes each, sorted by slot. The sort invariant is established at `close()`/`freeze()` time. + +| Method | Cost | +|---|---| +| `len()`, `is_empty()` | O(1) | +| `primary_bytes()` | O(1) | +| `overflow_entries() -> impl Iterator<(usize,u32)>` | O(n_overflow) iteration | +| `get(slot)` | O(1) primary; binary search O(log k) for overflow slots | +| `iter() -> IntSliceViewIter<'a>` | merge scan, O(n + k) | +| `sum()` | byte scan + overflow, O(n + k) | +| `count_nonzero()` | byte scan, O(n) | +| Distance methods (`bray_dist`, `euclidean_dist`, `jaccard_dist`, …) | O(n + k) | + +`IntSliceViewIter<'a>`: merge scan using `overflow_pos` index. Requires sorted overflow — guaranteed by the construction lifecycle. 
+ +**Builder `view()` vs reader `view()`:** `PersistentCompactIntVecBuilder` stores overflow as an unsorted `HashMap`, not raw bytes. Its `view()` returns an `IntSliceView` with `overflow_raw = &[]` and `n_overflow = 0`. This is intentional — the view is primarily useful after `freeze()`. During building, callers that need overflow use `overflow_entries()` directly. + +--- + +## Concrete types + +```mermaid +classDiagram + class BitSliceView { + +words: &[u64] + +n: usize + +get(slot) bool + +count_ones() u64 + +iter() BitSliceIter + +jaccard_dist/hamming_dist(other: BitSliceView) + } + class IntSliceView { + +primary: &[u8] + +overflow_raw: &[u8] + +n_overflow: usize + +n: usize + +get(slot) u32 + +iter() IntSliceViewIter + +overflow_entries() Iterator + +bray_dist/euclidean_dist/…(other: IntSliceView) + } + class PersistentBitVec { + -mmap: Mmap + -n: usize + +view() BitSliceView + +get(slot) bool + +count_ones/zeros() u64 + +iter() BitIter + +partial_jaccard_dist(&Self) (u64,u64) + +jaccard_dist/hamming_dist(&Self) … + } + class PersistentBitVecBuilder { + -mmap: MmapMut + -n: usize + +view() BitSliceView + +set(slot, bool) + +or/and/xor/not(BitSliceView) + +copy_from(BitSliceView) + +close() / finish() → PersistentBitVec + } + class PersistentCompactIntVec { + -mmap: Mmap + -n: usize + -n_overflow: usize + -step: usize + -index: Vec~(usize,usize)~ + +view() IntSliceView + +get(slot) u32 + +iter() Iter + +sum/count_nonzero() u64 + +bray_dist/euclidean_dist/… (&Self) + } + class PersistentCompactIntVecBuilder { + -mmap: MmapMut + -n: usize + -overflow: HashMap~usize,u32~ + +view() IntSliceView + +set(slot, u32) / get(slot) u32 + +inc / inc_present / inc_present_fast + +inc_predicate / inc_predicate_fast + +add/min/max/diff/mask_with(…View) + +primary_bytes/primary_bytes_mut() + +close() / finish() → PersistentCompactIntVec + } + + PersistentBitVec --> BitSliceView : view() + PersistentBitVecBuilder --> BitSliceView : view() + PersistentCompactIntVec --> 
IntSliceView : view() + PersistentCompactIntVecBuilder --> IntSliceView : view() (primary only) + PersistentBitVecBuilder --> PersistentBitVec : close() then open() + PersistentCompactIntVecBuilder --> PersistentCompactIntVec : close() then open() +``` + +### `PersistentBitVec` / `PersistentBitVecBuilder` + +`PersistentBitVec` is the read-only type. `view()` returns a `BitSliceView<'_>` over the mmap word array. Direct inherent methods delegate to the view: `count_ones()`, `count_zeros()`, `partial_jaccard_dist(&Self)`, `jaccard_dist(&Self)`, `hamming_dist(&Self)`. + +`BitIter<'a>` — exported iterator for `PersistentBitVec::iter()`: + +```rust +pub struct BitIter<'a> { pub(crate) words: &'a [u64], pub(crate) slot: usize, pub(crate) n: usize } +``` + +`PersistentBitVecBuilder` is the read-write type. Mutation operations accept `BitSliceView<'_>`: + +| Method | Cost | +|---|---| +| `set(slot, bool)` | O(1) | +| `view() -> BitSliceView<'_>` | O(1) | +| `or/and/xor(BitSliceView)` | word-level, O(n/64), SIMD-friendly | +| `not()` | `w ^= u64::MAX` per word, re-masks last word, O(n/64) | +| `copy_from(BitSliceView)` | `copy_from_slice`, O(n/64) | + +### `PersistentCompactIntVec` / `PersistentCompactIntVecBuilder` + +`PersistentCompactIntVec` is the read-only type. `view()` returns an `IntSliceView<'_>` over the mmap primary and overflow arrays. Inherent `iter()` is a merge scan (`Iter` struct). Inherent `sum()` and `count_nonzero()` use fast byte-scan helpers. + +`PersistentCompactIntVecBuilder` is the read-write type. 
Mutation methods on the builder fall into two categories: + +**Point mutations:** + +| Method | Note | +|---|---| +| `set(slot, u32)` | writes primary[slot] or 255+overflow | +| `get(slot) -> u32` | reads primary byte or HashMap | +| `inc(slot)` | `get` + `set`, O(1) | + +**Bulk computation methods** — accept view arguments: + +| Method | Semantics | Overflow | +|---|---|---| +| `inc_present(BitSliceView)` | `+= 1` at each 1-bit | via `inc`, safe for any group size | +| `inc_present_fast(BitSliceView)` | same, raw u8 `+= 1` | `debug_assert` no 255 reached | +| `inc_predicate(IntSliceView, pred)` | `+= 1` where `pred(col[s])` | two-pass, safe | +| `inc_predicate_fast(IntSliceView, pred)` | same, raw u8 | `debug_assert` no 255 reached | +| `add(IntSliceView)` | `self[s] += other[s]` | primary fast path + overflow fallback | +| `min(IntSliceView)` | byte min + both-overflow fixup | see algorithm below | +| `max(IntSliceView)` | pre-pass + byte max | see algorithm below | +| `diff(IntSliceView)` | saturating sub | self<255 hot path | +| `mask_with(BitSliceView)` | zeros slots where mask bit = 0 | O(n_zeros) | + +**`inc_present_fast` / `inc_predicate_fast` invariant:** caller guarantees no counter reaches 255 during the operation (group size < 255 for `inc_present_fast`, or chunk size < 255 for `inc_predicate_fast`). Violation is caught by `debug_assert` in dev builds. + +**`min` algorithm:** + +Exploits 255 = +∞: byte-level min is correct unless both sides are overflow. + +``` +snapshot self_ov: Vec<(slot,val)> +snapshot other_ov: HashMap +clear_overflow() +Pass 1 — byte min, SIMD-vectorizable, O(n) +Pass 2 — both-overflow fixup, O(k_self): + for (slot, self_val) in self_ov: + if slot ∈ other_ov: set(slot, min(self_val, other_ov[slot])) +``` + +**`max` algorithm:** + +Cannot do byte max first — `max(255, b<255)=255` overwrites self's original overflow value. Pre-pass reads self's value at other's overflow slots before the byte pass. 
+ +``` +Pre-pass O(k_other): for (slot, other_val) in other.overflow_entries(): + set(slot, max(self.get(slot), other_val)) +Pass 1 — byte max, SIMD-vectorizable, O(n) +``` + +--- + +## Matrix types + +Four matrix types, two encodings × two formats: + +| | Columnar format | Packed format | +|---|---|---| +| **Bit** | `PersistentBitMatrix` (Columnar variant) | `PersistentBitMatrix` (Packed variant) | +| **Int** | `PersistentCompactIntMatrix` (Columnar variant) | `PersistentCompactIntMatrix` (Packed variant) | + +Both matrix types are enums (`Columnar` / `Packed` / `Implicit` for bit) behind a transparent API. `col_view(c)` returns the appropriate view directly: + +```rust +// PersistentBitMatrix +pub fn col_view(&self, c: usize) -> BitSliceView<'_> + +// PersistentCompactIntMatrix +pub fn col_view(&self, c: usize) -> IntSliceView<'_> +``` + +No wrapper enums (`BitColView`, `IntColView`): the caller receives a `Copy` view struct immediately usable with any view method or bulk builder method. + +`pack_compact_int_matrix` and `pack_bit_matrix` convert columnar → packed format. + +--- + +## Aggregation traits (matrix level) + +### ColumnWeights + +```rust +trait ColumnWeights: Send + Sync { + fn col_weights(&self) -> Array1; // sum per column + fn partial_kmer_counts(&self) -> Array1; // default = col_weights() +} +``` + +`partial_kmer_counts` is overridden for count matrices to return `count_nonzero` per column (distinct kmers) rather than total count. + +### CountPartials + +Abstract required methods: `partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`, `partial_relfreq_bray`, `partial_relfreq_euclidean`, `partial_hellinger`. + +**Additivity rule:** self-contained partials (`partial_bray`, `partial_euclidean`, `partial_threshold_jaccard`) can be element-wise summed across all `(partition, layer)` pairs. 
Normalised partials (`partial_relfreq_*`, `partial_hellinger`) require the **global** `col_weights` (accumulated across all layers and all partitions) as parameter. + +**`partial_threshold_jaccard` returns `(inter, union)`** because `union[i,j]` depends on both columns simultaneously. + +Provided finalisations: + +| Finalisation | Formula | +|---|---| +| `bray_dist_matrix()` | `1 − 2·partial_bray[i,j] / (w[i] + w[j])` | +| `euclidean_dist_matrix()` | `√partial_euclidean[i,j]` | +| `threshold_jaccard_dist_matrix(t)` | `1 − inter[i,j] / union[i,j]` | +| `relfreq_bray_dist_matrix()` | `1 − partial_relfreq_bray[i,j]` | +| `relfreq_euclidean_dist_matrix()` | `√partial_relfreq_euclidean[i,j]` | +| `hellinger_dist_matrix()` | `√partial_hellinger[i,j] / √2` | +| `hellinger_euclidean_dist_matrix()` | `√partial_hellinger[i,j]` | + +### BitPartials + +Required: `partial_jaccard() -> (Array2, Array2)`, `partial_hamming() -> Array2`. Both additive across layers and partitions. + +--- + +## Temp-file-backed types + +**All inter-function results use temp-file-backed types** so the OS can page them out under memory pressure. This matters in practice: processing dozens of layers × hundreds of partitions in parallel would otherwise accumulate gigabytes of live anonymous memory. + +### Lifecycle + +``` +TempCompactIntVecBuilder::new(n) → writable mmap in TempDir + ↓ (inc_present_fast / inc_predicate_fast / add / mask_with / …) + .freeze() → TempCompactIntVec (read-only mmap + TempDir) + ↓ (optional) + .make_persistent(path) → PersistentCompactIntVec (permanent file) +``` + +Same pattern for `TempBitVecBuilder` → `TempBitVec` → `PersistentBitVec`. + +**Drop order**: `TempCompactIntVec { vec: PersistentCompactIntVec, _temp: TempDir }` — Rust drops fields in declaration order. `vec` (mmap) released before `_temp` (directory deleted). No explicit `drop()` needed. 
+ +### TempCompactIntVec / TempCompactIntVecBuilder + +```rust +pub struct TempCompactIntVec { + vec: PersistentCompactIntVec, + _temp: TempDir, // dropped after vec +} + +pub(crate) struct TempCompactIntVecBuilder { + builder: PersistentCompactIntVecBuilder, + temp: TempDir, +} +``` + +`TempCompactIntVec`: read access via `get(slot)`, `sum()`, `iter()`, `view() -> IntSliceView<'_>`. + +`TempCompactIntVecBuilder`: full delegation to inner `PersistentCompactIntVecBuilder` — all bulk computation methods (`inc_present_fast`, `inc_predicate_fast`, `add`, `min`, `max`, `diff`, `mask_with`) are exposed as `pub(crate)`. + +### TempBitVec / TempBitVecBuilder + +```rust +pub struct TempBitVec { + vec: PersistentBitVec, + _temp: TempDir, +} + +pub(crate) struct TempBitVecBuilder { + builder: PersistentBitVecBuilder, + temp: TempDir, +} +``` + +`TempBitVec`: read access via `get(slot)`, `count_ones()`, `view() -> BitSliceView<'_>`, `iter()`. + +`TempBitVecBuilder`: exposes `set(slot, bool)`, `or(BitSliceView)`, and: + +```rust +pub(crate) fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) +``` + +`or_where` — two passes, no intermediate allocation: + +``` +Pass 1 — primary bytes, O(n): + for slot in 0..n: + b = col.primary_bytes()[slot] + if b < 255 AND pred(b as u32): self.set(slot, true) + +Pass 2 — overflow, O(k): + for (slot, val) in col.overflow_entries(): + if pred(val): self.set(slot, true) +``` + +--- + +## Filter / Select API + +### ColGroup + +```rust +pub struct ColGroup { pub name: String, pub indices: Vec } +``` + +Defined **once at the index level** from column metadata. Valid in all matrices of all layers and partitions — column structure is identical across the entire hierarchy; only rows (kmer slots) are partitioned. + +### Composition axis + +- **Across partitions**: kmer space is partitioned → partial results **concatenated** (disjoint kmer ranges). 
+- **Across layers**: same kmer space, different counts → partial results **aggregated** (add, OR, etc.). + +### MatrixGroupOps + +Five required primitives + two default methods derived from them. All return temp-file-backed types. + +```rust +pub trait MatrixGroupOps { + // required + fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) + -> io::Result; + fn partial_group_sum(&self, g: &ColGroup) + -> io::Result; + fn partial_group_any(&self, g: &ColGroup, threshold: u32) + -> io::Result; + fn partial_group_min(&self, g: &ColGroup) + -> io::Result; + fn partial_group_max(&self, g: &ColGroup) + -> io::Result; + + // defaults derived from partial_group_presence_count + fn partial_group_all(&self, g: &ColGroup, threshold: u32) + -> io::Result; // slot=1 iff count == g.indices.len() + fn partial_group_none(&self, g: &ColGroup, threshold: u32) + -> io::Result; // slot=1 iff count == 0 +} +``` + +Implemented for both `PersistentCompactIntMatrix` and `PersistentBitMatrix`. + +For **bit matrices**: values are 0/1, so `partial_group_sum` = `partial_group_presence_count(g, 1)`; `partial_group_min` is AND (set first column then mask-with remaining); `partial_group_max` is OR via `partial_group_any` + `inc_present`. + +**`partial_group_presence_count` — chunking for large groups:** + +When `g.indices.len() < 255`: per-slot counts stay within `u8` range. Use `inc_present_fast` (bit) or `inc_predicate_fast(col_view(c), |v| v >= threshold)` (int) — raw u8 increment, no overflow entry written. + +When `g.indices.len() ≥ 255`: process in chunks of 254 columns, accumulate via `.add(chunk_frozen.view())`. + +**`partial_group_min` (int matrix)**: copy first column via `.add(col_view(first))` (start from 0 ⇒ copy), then `.min(col_view(c))` for remaining. + +**`partial_group_max` (int matrix)**: `.max(col_view(c))` for all columns (start from 0 ⇒ first column acts as copy). 
+ +**`partial_group_any`** uses `or_where` on `TempBitVecBuilder` (two-pass: primary bytes then overflow entries). + +**`partial_group_all` / `partial_group_none`** (default): call `partial_group_presence_count`, then iterate slots to produce the bit result. O(n) extra pass, not chunked. + +### add_col_from — matrix builder integration + +Both matrix builders accept temp-file results directly: + +```rust +// PersistentBitMatrixBuilder +fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> +fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> // nonzero → 1 + +// PersistentCompactIntMatrixBuilder +fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()> +fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> // bit → 0/1 u32 +``` + +`add_col_from` copies the temp file to the matrix directory and increments `n_cols`; `close()` writes `meta.json` with the final column count. No separate `write_meta` step needed. + +### mask_with + +Direct method on `PersistentCompactIntVecBuilder` (and delegation via `TempCompactIntVecBuilder`). Zeros every slot where the corresponding mask bit is 0. Scans the mask one 64-bit word at a time and visits only its zero bits — O(n/64 + n_zeros); an all-ones word costs a single comparison. + +``` +for (w_idx, word) in mask.words(): + if word == u64::MAX: continue // skip all-ones words + zeros = !word + while zeros != 0: + bit = trailing_zeros(zeros) + s = w_idx * 64 + bit + if primary[s] != 0: set(s, 0) // clears overflow entry too + zeros &= zeros − 1 +``` + +Terminal operation for Filter (retain only selected kmer slots in a count vector) and Select (positional selection without MPHF). 
diff --git a/docmd/implementation/obitaxonomy.md b/docmd/implementation/obitaxonomy.md new file mode 100644 index 0000000..d8ccd22 --- /dev/null +++ b/docmd/implementation/obitaxonomy.md @@ -0,0 +1,143 @@ +# `obitaxonomy` — taxonomy concept paths + +`obitaxonomy` is a dependency-free crate that defines a typed representation +of hierarchical concept paths (taxonomic or otherwise) stored in genome metadata. + +--- + +## Concept path syntax + +A concept path is stored as a metadata value with the prefix `taxonomy:/`: + +``` +taxonomy:/enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species +``` + +Structure: + +- The `taxonomy:/` prefix is the type discriminator. Any metadata value starting + with it is parsed as a `TaxPath`; all others remain plain strings. +- The remainder is one or more `/`-separated segments. +- Each segment is `name` or `name@rank`, where `rank` is a label for the + taxonomic level (e.g. `family`, `genus`, `species`). +- Rank annotations are **optional per segment** and can be mixed freely. +- Spaces are allowed in both names and ranks. + +### Reserved character + +`@` is reserved throughout the taxonomy system and may **not** appear in: + +| Context | Constraint | +|---------|------------| +| Segment name | forbidden | +| Rank/class label | forbidden | +| Metadata key names | forbidden (used as `key@rank` in predicate syntax) | + +`@` is freely allowed in plain-text metadata values (non-taxonomy). + +### Parse errors + +| Condition | Error | +|-----------|-------| +| Value does not start with `taxonomy:/` | `MissingPrefix` | +| No segments after the prefix | `EmptyPath` | +| Segment with empty name (consecutive `/`) | `EmptySegmentName` | +| Segment with trailing `@` and no rank (`name@`) | `EmptyRankName` | +| Segment with more than one `@` | `AmbiguousRank` | + +--- + +## Public API + +### `TaxSegment` + +A single node: a name and an optional rank. 
+ +```rust +seg.name() // &str +seg.rank() // Option<&str> +seg.to_string() // "name" or "name@rank" +TaxSegment::parse(s) // Result +``` + +### `TaxPath` + +```rust +TaxPath::parse(s) // Result +path.segments() // &[TaxSegment] +path.depth() // usize — number of segments +path.is_ancestor_of(&other) // bool — prefix match by name, ranks ignored +path.name_at_rank("genus") // Option<&str> +path.to_string() // reconstructs "taxonomy:/…" +``` + +`is_ancestor_of` compares segment **names** only — rank annotations are +informational and do not affect the ancestry relation. + +```rust +let a: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus".parse()?; +let b: TaxPath = "taxonomy:/Enterobacteriaceae@family/Escherichia@genus/Escherichia coli@species".parse()?; + +assert!(a.is_ancestor_of(&b)); // true +assert!(b.is_ancestor_of(&a)); // false +assert!(a.is_ancestor_of(&a)); // true (equal ⇒ ancestor) + +assert_eq!(b.name_at_rank("species"), Some("Escherichia coli")); +assert_eq!(b.name_at_rank("genus"), Some("Escherichia")); +assert_eq!(b.name_at_rank("order"), None); +``` + +--- + +## Integration with `GenomeInfo` + +At index load time, every metadata value is inspected once: + +- Starts with `taxonomy:/` → parsed into `TaxPath`, stored in `genome.taxonomy`. +- Otherwise → kept as-is in `genome.meta`. + +```rust +struct GenomeInfo { + label: String, + meta: HashMap, // plain text metadata + taxonomy: HashMap, // parsed taxonomy metadata +} +``` + +The raw string is not duplicated. `TaxPath::to_string()` reconstructs the +original value losslessly for serialisation. + +--- + +## Predicate operators (in `filter` / `select`) + +Path predicates use the `~` / `!~` operators. The **stored value** always starts +with `/` (rooted path); the **query pattern** does not need to. 
+ +### Path pattern syntax + +| Pattern | Semantics | +|---------|-----------| +| `A/B` | contiguous sub-path A then B, anywhere in the value | +| `/A/B` | value starts with A then B (start-anchored) | +| `A/B$` | value ends with A then B (end-anchored) | +| `/A/B$` | value is exactly A then B (fully anchored) | +| `A@x/B` | A with class `x` followed by B with any class | +| `A@x/B@y` | A with class `x` followed by B with class `y` | + +A segment pattern without `@` matches the segment name regardless of its stored class. + +### Rank-aware queries + +``` +key@rank=value +``` + +| Predicate form | Semantics | +|----------------|-----------| +| `key@rank=value` | genome's `key` has `value` at rank `rank` | +| `key@rank!=value` | does not | +| `key@rank=v1\|v2` | value at `rank` is `v1` or `v2` | + +`~` combined with `@rank` on the key (e.g. `key@genus~pattern`) is not defined +and is rejected at parse time. diff --git a/docmd/installation.md b/docmd/installation.md index d9a5cda..ab4b934 100644 --- a/docmd/installation.md +++ b/docmd/installation.md @@ -60,13 +60,13 @@ HPC home directories are typically on a network filesystem (Lustre, NFS) optimis **Always redirect the build directory to a local scratch disk:** ```bash -CARGO_TARGET_DIR=/scratch/local/$USER/cargo-target cargo build --release +CARGO_TARGET_DIR=/scratch/$USER/cargo-target cargo build --release ``` Adapt the path to the local scratch available on your cluster (`/var/tmp`, `/tmp`, `/scratch/local`, etc.). 
Once built, copy the binary to a permanent location: ```bash -cp /scratch/local/$USER/cargo-target/release/obikmer ~/bin/ +cp /scratch/$USER/cargo-target/release/obikmer ~/bin/ ``` ## NUMA support diff --git a/mkdocs.yml b/mkdocs.yml index c27d1a9..7973e78 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -53,6 +53,7 @@ nav: - Merge parallelism & memory: implementation/merge_parallelism.md - Kmer filtering: implementation/filtering.md - Select command: implementation/select.md + - obitaxonomy crate: implementation/obitaxonomy.md - Architecture: - Sequences: architecture/sequences/invariant.md - Kmer index: architecture/index_architecture.md diff --git a/obicompactvector_reflexion.md b/obicompactvector_reflexion.md new file mode 100644 index 0000000..a8e2356 --- /dev/null +++ b/obicompactvector_reflexion.md @@ -0,0 +1,44 @@ +# La crate obicompactvector + +Le code actuel est ce qu'il est. Ce n'est pas la vérité absolue, c'est un premier effort d'implémentation, rien de plus. Ci-dessous je vais décrire les objectifs et la structure qui devrait être. LA VÉRITÉ À ATTEINDRE. + +La crate fournit des représentations les plus compactes possible en mémoire de matrices de comptage ou de présence de k-mers dans des génomes. Chaque colonne représente un génome, chaque ligne un k-mer. Une matrice est une collection de vecteurs où chacun des vecteurs est une colonne de la matrice. + +Les matrices comme les colonnes ont vocation à être persistantes. Les données sont stockées dans des fichiers binaires. Les données sont mappées en mémoire via `mmap`. + +Les structures sont par essence immuables. Il existe des représentations mutables des colonnes qui permettent leur construction. À la fin de leur construction, les colonnes sont fermées, ce qui les rend immuables. + +Les matrices peuvent être représentées de deux façons: + - via un répertoire contenant une collection de fichiers colonnes + - via un fichier matrix qui est la concaténation de plusieurs fichiers colonnes. 
+ + +## Les matrices de comptage + +Ce sont des matrices d'entiers positifs, la plupart du temps de petites valeurs (inférieures à 255). On suppose que toutes les valeurs sont représentables sur un `u32`. + +## Les matrices de présence + +Ce sont des matrices de booléens représentées comme des champs de bits. + +Il existe une forme implicite des vecteurs de présence, qui n'est représentée par aucun fichier et pour laquelle toutes les valeurs sont vraies. + +## Représentation légère des colonnes + +Les colonnes, qu'elles soient unitaires (fichier colonne) ou partie d'un fichier composite matrice, peuvent être représentées par un objet léger donnant accès à leurs valeurs ainsi qu'à la longueur du vecteur. Toutes les méthodes de calcul doivent uniquement travailler à partir de ces représentations légères unifiées des colonnes. + +### Représentation légère d'un vecteur de présence + +Le vecteur est représenté par + - un champ de bits encodé comme un [u64] + - un usize encodant la longueur du champ de bits + +### Représentation légère d'un vecteur de comptage + +Le vecteur est représenté par + - un vecteur [u8] encodant directement les valeurs faibles du vecteur [0,255[ + La valeur 255 est une valeur sentinelle indiquant que la valeur vraie est >=255 + et se trouve dans une structure d'overflow + - un itérateur de (usize,u32) listant les valeurs d'overflow correspondant aux valeurs + sentinelles (255) du [u8] + - un usize encodant la longueur du vecteur diff --git a/src/Cargo.lock b/src/Cargo.lock index 2983231..4c59927 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1704,7 +1704,7 @@ dependencies = [ [[package]] name = "obikmer" -version = "0.1.0" +version = "0.1.3" dependencies = [ "clap", "csv", @@ -1722,6 +1722,7 @@ dependencies = [ "obiskbuilder", "obiskio", "obisys", + "obitaxonomy", "pprof", "rayon", "serde_json", @@ -1853,6 +1854,10 @@ dependencies = [ "tracing", ] +[[package]] +name = "obitaxonomy" +version = "0.1.0" + [[package]] name = "object" version = "0.37.3" 
diff --git a/src/Cargo.toml b/src/Cargo.toml index 46a4f87..141df02 100644 --- a/src/Cargo.toml +++ b/src/Cargo.toml @@ -1,5 +1,5 @@ [workspace] resolver = "3" -members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"] +members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"] [profile.release] debug = 1 diff --git a/src/obicompactvec/Cargo.toml b/src/obicompactvec/Cargo.toml index ddb1e40..777b606 100644 --- a/src/obicompactvec/Cargo.toml +++ b/src/obicompactvec/Cargo.toml @@ -7,6 +7,6 @@ edition = "2024" memmap2 = "0.9" ndarray = "0.16" rayon = "1" +tempfile = "3" [dev-dependencies] -tempfile = "3" diff --git a/src/obicompactvec/src/bitmatrix.rs b/src/obicompactvec/src/bitmatrix.rs index ca1b393..72f8b05 100644 --- a/src/obicompactvec/src/bitmatrix.rs +++ b/src/obicompactvec/src/bitmatrix.rs @@ -7,8 +7,12 @@ use ndarray::{Array1, Array2}; use rayon::prelude::*; use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; +use crate::colgroup::{ColGroup, MatrixGroupOps}; use crate::layer_meta::LayerMeta; use crate::meta::MatrixMeta; +use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; +use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; +use crate::views::BitSliceView; fn col_path(dir: &Path, col: usize) -> PathBuf { dir.join(format!("col_{col:06}.pbiv")) @@ -54,34 +58,11 @@ impl ColumnarBitMatrix { } pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { - let n = self.n_cols(); - let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| { - let (inter, union) = self.col(i).partial_jaccard_dist(self.col(j)); - (i, j, inter, union) - }) - .collect(); - let mut inter_m = Array2::zeros((n, n)); 
- let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_jaccard_dist(self.col(j))) } pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2 { - self.pairwise_u64(|i, j| self.col(i).hamming_dist(self.col(j))) - } - - fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { - let n = self.n_cols(); - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| (i, j, f(i, j))) - .collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + pairwise_matrix(self.n_cols(), |i, j| self.col(i).hamming_dist(self.col(j))) } pub(crate) fn append_column(dir: &Path, value_of: impl Fn(usize) -> bool) -> io::Result<()> { @@ -147,84 +128,46 @@ impl PackedBitMatrix { }).collect() } - #[inline] fn col_bytes(&self, c: usize) -> &[u8] { let start = self.data_offsets[c]; - let len = (self.n_rows + 7) / 8; - &self.mmap[start..start + len] + &self.mmap[start..start + self.n_rows.div_ceil(8)] } - fn count_ones_col(&self, c: usize) -> u64 { - let bytes = self.col_bytes(c); - let full = self.n_rows / 8; - let rem = self.n_rows % 8; - let mut n: u64 = bytes[..full].iter().map(|b| b.count_ones() as u64).sum(); - if rem > 0 { n += (bytes[full] & ((1u8 << rem) - 1)).count_ones() as u64; } - n + fn col_words(&self, c: usize) -> &[u64] { + let nw = self.n_rows.div_ceil(64); + // SAFETY: data_offsets[c] is always 8-byte aligned. + // PBMX header = 24 + n_cols×8 (multiple of 8); each PBIV blob = + // 16 + nwords×8 (multiple of 8); mmap base is page-aligned. 
+ let ptr = self.mmap[self.data_offsets[c]..].as_ptr() as *const u64; + unsafe { std::slice::from_raw_parts(ptr, nw) } } - fn pair_op(&self, i: usize, j: usize, and_or: bool) -> u64 { - let ai = self.col_bytes(i); - let aj = self.col_bytes(j); - let full = self.n_rows / 8; - let rem = self.n_rows % 8; - let mut n: u64 = ai[..full].iter().zip(aj[..full].iter()) - .map(|(a, b)| if and_or { a & b } else { a ^ b }.count_ones() as u64) - .sum(); - if rem > 0 { - let mask = (1u8 << rem) - 1; - let last = if and_or { ai[full] & aj[full] } else { ai[full] ^ aj[full] }; - n += (last & mask).count_ones() as u64; - } - n + pub(crate) fn col_slice(&self, c: usize) -> BitSliceView<'_> { + BitSliceView::new(self.col_words(c), self.n_rows) } - fn partial_jaccard_col(&self, i: usize, j: usize) -> (u64, u64) { - let ai = self.col_bytes(i); - let aj = self.col_bytes(j); - let full = self.n_rows / 8; - let rem = self.n_rows % 8; - let (mut inter, mut union) = ai[..full].iter().zip(aj[..full].iter()) - .fold((0u64, 0u64), |(inter, union), (a, b)| { - (inter + (a & b).count_ones() as u64, - union + (a | b).count_ones() as u64) - }); - if rem > 0 { - let mask = (1u8 << rem) - 1; - inter += ((ai[full] & aj[full]) & mask).count_ones() as u64; - union += ((ai[full] | aj[full]) & mask).count_ones() as u64; - } - (inter, union) + pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { + PersistentBitVecBuilder::from_raw_bytes(self.col_bytes(c), self.n_rows, path) } pub(crate) fn count_ones(&self) -> Array1 { Array1::from_vec( - (0..self.n_cols).into_par_iter().map(|c| self.count_ones_col(c)).collect() + (0..self.n_cols).into_par_iter() + .map(|c| self.col_slice(c).count_ones()) + .collect() ) } pub(crate) fn partial_jaccard_dist_matrix(&self) -> (Array2, Array2) { - let n = self.n_cols; - let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| { let (inter, union) = self.partial_jaccard_col(i, j); (i, j, inter, union) }) - .collect(); - 
let mut inter_m = Array2::zeros((n, n)); - let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pairwise2_matrix(self.n_cols, |i, j| { + self.col_slice(i).partial_jaccard_dist(self.col_slice(j)) + }) } pub(crate) fn partial_hamming_dist_matrix(&self) -> Array2 { - let n = self.n_cols; - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| (i, j, self.pair_op(i, j, false))) - .collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + pairwise_matrix(self.n_cols, |i, j| { + self.col_slice(i).hamming_dist(self.col_slice(j)) + }) } } @@ -343,6 +286,24 @@ impl PersistentBitMatrix { } } + pub fn col_view(&self, c: usize) -> BitSliceView<'_> { + match self { + Self::Columnar(m) => m.col(c).view(), + Self::Packed(m) => m.col_slice(c), + Self::Implicit { .. } => panic!("col_view() not available on Implicit PersistentBitMatrix"), + } + } + + pub fn col_persist(&self, c: usize, path: &Path) -> io::Result { + match self { + Self::Columnar(m) => PersistentBitVecBuilder::build_from(m.col(c), path), + Self::Packed(m) => m.col_persist(c, path), + Self::Implicit { n_rows, .. 
} => { + PersistentBitVecBuilder::new_ones(*n_rows, path) + } + } + } + pub fn row(&self, slot: usize) -> Box<[bool]> { match self { Self::Columnar(m) => m.row(slot), @@ -439,12 +400,93 @@ impl PersistentBitMatrixBuilder { PersistentBitVecBuilder::new(self.n, &path) } + pub fn add_col_ones(&mut self) -> io::Result { + let path = col_path(&self.dir, self.n_cols); + self.n_cols += 1; + PersistentBitVecBuilder::new_ones(self.n, &path) + } + + pub fn add_col_from(&mut self, src: &TempBitVec) -> io::Result<()> { + src.make_persistent(&col_path(&self.dir, self.n_cols))?; + self.n_cols += 1; + Ok(()) + } + + pub fn add_col_from_int(&mut self, src: &TempCompactIntVec) -> io::Result<()> { + let path = col_path(&self.dir, self.n_cols); + self.n_cols += 1; + let mut b = PersistentBitVecBuilder::new(self.n, &path)?; + b.or_where(src.view(), |v| v > 0); + b.close() + } + pub fn close(self) -> io::Result<()> { MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir) } } -// ── Helpers ─────────────────────────────────────────────────────────────────── +// ── MatrixGroupOps ──────────────────────────────────────────────────────────── + +impl MatrixGroupOps for PersistentBitMatrix { + fn partial_group_presence_count(&self, g: &ColGroup, _threshold: u32) -> io::Result { + // Bit matrices store 0/1 — threshold is structurally always 1. 
+ let n = self.n(); + if g.indices.len() < 255 { + let mut builder = TempCompactIntVecBuilder::new(n)?; + for &c in &g.indices { + builder.inc_present_fast(self.col_view(c)); + } + builder.freeze() + } else { + let mut result = TempCompactIntVecBuilder::new(n)?; + for chunk in g.indices.chunks(254) { + let mut chunk_b = TempCompactIntVecBuilder::new(n)?; + for &c in chunk { + chunk_b.inc_present_fast(self.col_view(c)); + } + let frozen = chunk_b.freeze()?; + result.add(frozen.view()); + } + result.freeze() + } + } + + fn partial_group_sum(&self, g: &ColGroup) -> io::Result { + // For bit matrices, sum = count of 1-bits — identical to presence_count. + self.partial_group_presence_count(g, 1) + } + + fn partial_group_any(&self, g: &ColGroup, _threshold: u32) -> io::Result { + let n = self.n(); + let mut result = TempBitVecBuilder::new(n)?; + for &c in &g.indices { + result.or(self.col_view(c)); + } + result.freeze() + } + + fn partial_group_min(&self, g: &ColGroup) -> io::Result { + // min of 0/1 values = AND: 1 only if ALL columns are 1 + let n = self.n(); + let mut result = TempCompactIntVecBuilder::new(n)?; + if let Some((&first, rest)) = g.indices.split_first() { + result.inc_present_fast(self.col_view(first)); + for &c in rest { result.mask_with(self.col_view(c)); } + } + result.freeze() + } + + fn partial_group_max(&self, g: &ColGroup) -> io::Result { + // max of 0/1 values = OR: 1 if any column is 1 + let any = self.partial_group_any(g, 1)?; + let n = any.len(); + let mut result = TempCompactIntVecBuilder::new(n)?; + result.inc_present(any.view()); + result.freeze() + } +} + +// ── Shared matrix helpers (also used by intmatrix.rs) ───────────────────────── fn upper_pairs(n: usize) -> Vec<(usize, usize)> { (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect() @@ -456,3 +498,30 @@ where T: Clone + Default { for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; } m } + +/// Compute a symmetric `n×n` matrix in parallel by evaluating `f(i,j)` 
for +/// all upper-triangle pairs. `T: Copy` avoids the `.clone()` needed for the +/// lower-triangle mirror. +pub(crate) fn pairwise_matrix(n: usize, f: impl Fn(usize, usize) -> T + Sync) -> Array2 +where T: Copy + Default + Send { + let results: Vec<(usize, usize, T)> = upper_pairs(n) + .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); + fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) +} + +/// Same as `pairwise_matrix` but `f` returns two values that fill two +/// symmetric matrices simultaneously (e.g. intersection + union for Jaccard). +pub(crate) fn pairwise2_matrix(n: usize, f: impl Fn(usize, usize) -> (T, T) + Sync) -> (Array2, Array2) +where T: Copy + Default + Send { + let results: Vec<(usize, usize, T, T)> = upper_pairs(n) + .into_par_iter() + .map(|(i, j)| { let (a, b) = f(i, j); (i, j, a, b) }) + .collect(); + let mut m0 = Array2::from_elem((n, n), T::default()); + let mut m1 = Array2::from_elem((n, n), T::default()); + for (i, j, a, b) in results { + m0[[i, j]] = a; m0[[j, i]] = a; + m1[[i, j]] = b; m1[[j, i]] = b; + } + (m0, m1) +} diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs index cfc26aa..ee7d6f7 100644 --- a/src/obicompactvec/src/bitvec.rs +++ b/src/obicompactvec/src/bitvec.rs @@ -5,29 +5,25 @@ use std::path::{Path, PathBuf}; use memmap2::{Mmap, MmapMut}; use crate::reader::PersistentCompactIntVec; +use crate::views::{BitSliceIter, BitSliceView, IntSliceView}; const MAGIC: [u8; 4] = *b"PBIV"; // Header: magic(4) + _pad(4) + n(8) = 16 bytes. -// Data starts at offset 16, which is divisible by 8 → u64-aligned -// (mmap base is page-aligned, 16 % 8 == 0). +// Data starts at offset 16, u64-aligned (mmap base is page-aligned, 16 % 8 == 0). 
const HEADER_SIZE: usize = 16; #[inline] -fn n_words(n: usize) -> usize { - n.div_ceil(64) -} +pub(crate) fn n_words(n: usize) -> usize { n.div_ceil(64) } #[inline] -fn n_bytes_for_words(n: usize) -> usize { - n_words(n) * 8 -} +fn n_bytes_for_words(n: usize) -> usize { n_words(n) * 8 } -// ── Reader ──────────────────────────────────────────────────────────────────── +// ── PersistentBitVec ────────────────────────────────────────────────────────── pub struct PersistentBitVec { mmap: Mmap, - n: usize, + n: usize, path: PathBuf, } @@ -35,157 +31,145 @@ impl PersistentBitVec { pub fn open(path: &Path) -> io::Result { let mmap = unsafe { Mmap::map(&File::open(path)?)? }; if mmap.len() < HEADER_SIZE { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "PBIV file too short", - )); + return Err(io::Error::new(io::ErrorKind::InvalidData, "PBIV file too short")); } if &mmap[0..4] != &MAGIC { return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PBIV magic")); } let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; - Ok(Self { - mmap, - n, - path: path.to_path_buf(), - }) + Ok(Self { mmap, n, path: path.to_path_buf() }) } - pub fn path(&self) -> &Path { - &self.path - } - pub fn len(&self) -> usize { - self.n - } - pub fn is_empty(&self) -> bool { - self.n == 0 - } + pub fn path(&self) -> &Path { &self.path } + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } pub fn get(&self, slot: usize) -> bool { (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 } - // Used by iter() and get(): exact byte window, no padding. - fn data_bytes(&self) -> &[u8] { - &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n.div_ceil(8)] - } - - // Bulk word view. SAFETY: mmap is page-aligned, HEADER_SIZE=16 is divisible by 8, - // so &mmap[HEADER_SIZE] is u64-aligned. Slice length is n_words * 8 bytes. + // SAFETY: mmap is page-aligned, HEADER_SIZE=16 divisible by 8 → u64-aligned. 
fn data_words(&self) -> &[u64] { - let nw = n_words(self.n); + let nw = n_words(self.n); let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; unsafe { std::slice::from_raw_parts(ptr, nw) } } - pub fn count_ones(&self) -> u64 { - // Padding bits in the last word are 0, so no masking needed. - self.data_words() - .iter() - .map(|w| w.count_ones() as u64) - .sum() + pub fn view(&self) -> BitSliceView<'_> { + BitSliceView::new(self.data_words(), self.n) } - pub fn count_zeros(&self) -> u64 { - self.n as u64 - self.count_ones() - } + pub fn words(&self) -> &[u64] { self.data_words() } - pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 { - let (inter, union) = self.partial_jaccard_dist(other); - if union == 0 { - return 0.0; - } - 1.0 - inter as f64 / union as f64 - } + pub fn count_ones(&self) -> u64 { self.view().count_ones() } + pub fn count_zeros(&self) -> u64 { self.view().count_zeros() } pub fn partial_jaccard_dist(&self, other: &PersistentBitVec) -> (u64, u64) { - assert_eq!(self.n, other.n, "length mismatch"); - self.data_words() - .iter() - .zip(other.data_words()) - .fold((0u64, 0u64), |(i, u), (&a, &b)| { - ( - i + (a & b).count_ones() as u64, - u + (a | b).count_ones() as u64, - ) - }) + self.view().partial_jaccard_dist(other.view()) + } + pub fn jaccard_dist(&self, other: &PersistentBitVec) -> f64 { + self.view().jaccard_dist(other.view()) } - pub fn hamming_dist(&self, other: &PersistentBitVec) -> u64 { - assert_eq!(self.n, other.n, "length mismatch"); - self.data_words() - .iter() - .zip(other.data_words()) - .map(|(&a, &b)| (a ^ b).count_ones() as u64) - .sum() + self.view().hamming_dist(other.view()) } pub fn iter(&self) -> BitIter<'_> { - BitIter { - bytes: self.data_bytes(), - slot: 0, - n: self.n, - } + BitIter { words: self.data_words(), slot: 0, n: self.n } } } impl<'a> IntoIterator for &'a PersistentBitVec { type Item = bool; type IntoIter = BitIter<'a>; - fn into_iter(self) -> BitIter<'a> { - self.iter() - } + fn into_iter(self) 
-> BitIter<'a> { self.iter() } } +// ── BitIter ─────────────────────────────────────────────────────────────────── + pub struct BitIter<'a> { - bytes: &'a [u8], - slot: usize, - n: usize, + words: &'a [u64], + slot: usize, + n: usize, } impl ExactSizeIterator for BitIter<'_> {} impl Iterator for BitIter<'_> { type Item = bool; - fn next(&mut self) -> Option { - if self.slot >= self.n { - return None; - } - let v = (self.bytes[self.slot >> 3] >> (self.slot & 7)) & 1 != 0; + if self.slot >= self.n { return None; } + let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0; self.slot += 1; Some(v) } - fn size_hint(&self) -> (usize, Option) { let rem = self.n - self.slot; (rem, Some(rem)) } } -// ── Builder ─────────────────────────────────────────────────────────────────── +// ── PersistentBitVecBuilder ─────────────────────────────────────────────────── pub struct PersistentBitVecBuilder { mmap: MmapMut, - n: usize, + n: usize, + path: PathBuf, } impl PersistentBitVecBuilder { pub fn new(n: usize, path: &Path) -> io::Result { let file_size = HEADER_SIZE + n_bytes_for_words(n); let mut file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) + .read(true).write(true).create(true).truncate(true) .open(path)?; file.write_all(&MAGIC)?; - file.write_all(&[0u8; 4])?; // padding + file.write_all(&[0u8; 4])?; file.write_all(&(n as u64).to_le_bytes())?; file.seek(SeekFrom::Start(0))?; file.set_len(file_size as u64)?; let mmap = unsafe { MmapMut::map_mut(&file)? }; - Ok(Self { mmap, n }) + Ok(Self { mmap, n, path: path.to_path_buf() }) + } + + pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result { + let file_size = HEADER_SIZE + n_bytes_for_words(n); + let file = OpenOptions::new() + .read(true).write(true).create(true).truncate(true) + .open(path)?; + file.set_len(file_size as u64)?; + let mut mmap = unsafe { MmapMut::map_mut(&file)? 
}; + mmap[0..4].copy_from_slice(&MAGIC); + mmap[8..16].copy_from_slice(&(n as u64).to_le_bytes()); + mmap[HEADER_SIZE..HEADER_SIZE + bytes.len()].copy_from_slice(bytes); + Ok(Self { mmap, n, path: path.to_path_buf() }) + } + + /// Create an all-ones bit vector of length `n` at `path`. + /// + /// More efficient than `new(n, path)` + `not()`: the data is written as + /// 0xFF bytes in a single sequential pass, with no intermediate all-zeros state. + pub fn new_ones(n: usize, path: &Path) -> io::Result { + let nw = n_words(n); + let file_size = HEADER_SIZE + nw * 8; + let mut file = OpenOptions::new() + .read(true).write(true).create(true).truncate(true) + .open(path)?; + file.write_all(&MAGIC)?; + file.write_all(&[0u8; 4])?; + file.write_all(&(n as u64).to_le_bytes())?; + file.write_all(&vec![0xFFu8; nw * 8])?; + file.seek(SeekFrom::Start(0))?; + file.set_len(file_size as u64)?; + let mut mmap = unsafe { MmapMut::map_mut(&file)? }; + // Clear padding bits in the last word so trailing bits are always 0. + let rem = n % 64; + if rem != 0 { + let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; + let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) }; + words[nw - 1] &= (1u64 << rem) - 1; + } + Ok(Self { mmap, n, path: path.to_path_buf() }) } pub fn build_from(source: &PersistentBitVec, path: &Path) -> io::Result { @@ -193,86 +177,14 @@ impl PersistentBitVecBuilder { let file = OpenOptions::new().read(true).write(true).open(path)?; let mmap = unsafe { MmapMut::map_mut(&file)? 
}; let n = source.len(); - Ok(Self { mmap, n }) + Ok(Self { mmap, n, path: path.to_path_buf() }) } - pub fn len(&self) -> usize { - self.n - } - pub fn is_empty(&self) -> bool { - self.n == 0 - } - - pub fn get(&self, slot: usize) -> bool { - (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 - } - - pub fn set(&mut self, slot: usize, value: bool) { - let byte = HEADER_SIZE + (slot >> 3); - let bit = 1u8 << (slot & 7); - if value { - self.mmap[byte] |= bit; - } else { - self.mmap[byte] &= !bit; - } - } - - // SAFETY: same alignment argument as PersistentBitVec::data_words. - fn data_words_mut(&mut self) -> &mut [u64] { - let nw = n_words(self.n); - let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; - unsafe { std::slice::from_raw_parts_mut(ptr, nw) } - } - - pub fn and(&mut self, other: &PersistentBitVec) { - assert_eq!(self.n, other.n, "length mismatch"); - for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { - *sw &= ow; - } - } - - pub fn or(&mut self, other: &PersistentBitVec) { - assert_eq!(self.n, other.n, "length mismatch"); - for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { - *sw |= ow; - } - } - - pub fn xor(&mut self, other: &PersistentBitVec) { - assert_eq!(self.n, other.n, "length mismatch"); - for (sw, &ow) in self.data_words_mut().iter_mut().zip(other.data_words()) { - *sw ^= ow; - } - } - - pub fn not(&mut self) { - let rem = self.n % 64; - let words = self.data_words_mut(); - for w in words.iter_mut() { - *w ^= u64::MAX; - } - // Zero padding bits in the last word so count_ones / jaccard remain correct. - if rem != 0 { - if let Some(last) = words.last_mut() { - *last &= (1u64 << rem) - 1; - } - } - } - - /// Convert a count vector to a bit vector: bit set iff count >= threshold. - /// Fills u64 words directly from the count iterator — O(n), no bit-level set() overhead. 
- pub fn build_from_counts( - source: &PersistentCompactIntVec, - threshold: u32, - path: &Path, - ) -> io::Result { + pub fn build_from_counts(source: &PersistentCompactIntVec, threshold: u32, path: &Path) -> io::Result { let n = source.len(); let file_size = HEADER_SIZE + n_bytes_for_words(n); let mut file = OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) + .read(true).write(true).create(true).truncate(true) .open(path)?; file.write_all(&MAGIC)?; file.write_all(&[0u8; 4])?; @@ -280,27 +192,157 @@ impl PersistentBitVecBuilder { file.seek(SeekFrom::Start(0))?; file.set_len(file_size as u64)?; let mut mmap = unsafe { MmapMut::map_mut(&file)? }; - { - let nw = n_words(n); + let nw = n_words(n); let ptr = mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; let words = unsafe { std::slice::from_raw_parts_mut(ptr, nw) }; for (slot, count) in source.iter().enumerate() { - if count >= threshold { - words[slot >> 6] |= 1u64 << (slot & 63); - } + if count >= threshold { words[slot >> 6] |= 1u64 << (slot & 63); } } } - - Ok(Self { mmap, n }) + Ok(Self { mmap, n, path: path.to_path_buf() }) } - /// Convert a count vector to a presence/absence bit vector (threshold = 1). 
pub fn build_from_presence(source: &PersistentCompactIntVec, path: &Path) -> io::Result { Self::build_from_counts(source, 1, path) } - pub fn close(self) -> io::Result<()> { - self.mmap.flush() + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + pub fn get(&self, slot: usize) -> bool { + (self.mmap[HEADER_SIZE + (slot >> 3)] >> (slot & 7)) & 1 != 0 + } + + pub fn set(&mut self, slot: usize, value: bool) { + let bit = 1u64 << (slot & 63); + if value { self.data_words_mut()[slot >> 6] |= bit; } + else { self.data_words_mut()[slot >> 6] &= !bit; } + } + + fn data_words(&self) -> &[u64] { + let nw = n_words(self.n); + let ptr = self.mmap[HEADER_SIZE..].as_ptr() as *const u64; + unsafe { std::slice::from_raw_parts(ptr, nw) } + } + + // SAFETY: same alignment argument as PersistentBitVec::data_words. + fn data_words_mut(&mut self) -> &mut [u64] { + let nw = n_words(self.n); + let ptr = self.mmap[HEADER_SIZE..].as_mut_ptr() as *mut u64; + unsafe { std::slice::from_raw_parts_mut(ptr, nw) } + } + + pub fn view(&self) -> BitSliceView<'_> { + BitSliceView::new(self.data_words(), self.n) + } + + pub fn words(&self) -> &[u64] { self.data_words() } + + pub fn copy_from(&mut self, src: BitSliceView<'_>) { + assert_eq!(self.n, src.len(), "BitSliceView length mismatch"); + self.data_words_mut().copy_from_slice(src.words()); + } + + pub fn and(&mut self, other: BitSliceView<'_>) { + assert_eq!(self.n, other.len(), "BitSliceView length mismatch"); + for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w &= o; } + } + + pub fn or(&mut self, other: BitSliceView<'_>) { + assert_eq!(self.n, other.len(), "BitSliceView length mismatch"); + for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w |= o; } + } + + pub fn xor(&mut self, other: BitSliceView<'_>) { + assert_eq!(self.n, other.len(), "BitSliceView length mismatch"); + for (w, &o) in self.data_words_mut().iter_mut().zip(other.words()) { *w ^= o; } + } + + pub 
fn not(&mut self) { + let rem = self.n % 64; + let words = self.data_words_mut(); + for w in words.iter_mut() { *w ^= u64::MAX; } + if rem != 0 { + if let Some(last) = words.last_mut() { *last &= (1u64 << rem) - 1; } + } + } + + /// OR in bits at slots where `pred(col[slot])` is true. + pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + assert_eq!(self.n, col.len(), "IntSliceView length mismatch"); + let n = self.n; + let primary = col.primary_bytes(); + let words = self.data_words_mut(); + let nw = n_words(n); + for wi in 0..nw { + let base = wi * 64; + let limit = (base + 64).min(n); + let mut mask = 0u64; + for bit in 0..(limit - base) { + let b = primary[base + bit]; + if b < 255 && pred(b as u32) { mask |= 1u64 << bit; } + } + words[wi] |= mask; + } + for (slot, val) in col.overflow_entries() { + if pred(val) { words[slot >> 6] |= 1u64 << (slot & 63); } + } + } + + /// Clear bits at slots where `pred(col[slot])` is false. + pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + assert_eq!(self.n, col.len(), "IntSliceView length mismatch"); + let n = self.n; + let primary = col.primary_bytes(); + let words = self.data_words_mut(); + let nw = n_words(n); + for wi in 0..nw { + let base = wi * 64; + let limit = (base + 64).min(n); + let mut mask = 0u64; + for bit in 0..(limit - base) { + let b = primary[base + bit]; + if b < 255 && !pred(b as u32) { mask |= 1u64 << bit; } + } + words[wi] &= !mask; + } + for (slot, val) in col.overflow_entries() { + if !pred(val) { words[slot >> 6] &= !(1u64 << (slot & 63)); } + } + } + + /// Toggle bits at slots where `pred(col[slot])` is true. 
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
    assert_eq!(self.n, col.len(), "IntSliceView length mismatch");
    let n = self.n;
    let primary = col.primary_bytes();
    let words = self.data_words_mut();
    // Pass 1: primary bytes below the 255 sentinel, merged one word at a time.
    for (wi, word) in words.iter_mut().enumerate() {
        let start = wi * 64;
        let end = (start + 64).min(n);
        let mut flip = 0u64;
        for (bit, slot) in (start..end).enumerate() {
            let b = primary[slot];
            if b < 255 && pred(b as u32) { flip |= 1u64 << bit; }
        }
        *word ^= flip;
    }
    // Pass 2: sentinel slots, decided from their overflow value.
    for (slot, val) in col.overflow_entries() {
        if pred(val) { words[slot >> 6] ^= 1u64 << (slot & 63); }
    }
}

/// Iterate over the bits of the current contents.
pub fn iter(&self) -> BitSliceIter<'_> {
    self.view().iter()
}

/// Flush the mapping to disk and drop the builder.
pub fn close(self) -> io::Result<()> {
    self.mmap.flush()
}

/// Flush, then reopen the same path as a read-only `PersistentBitVec`.
pub fn finish(self) -> io::Result<PersistentBitVec> {
    let reopened = self.path.clone();
    self.close()?;
    PersistentBitVec::open(&reopened)
}
}

// ── src/obicompactvec/src/builder.rs ──────────────────────────────────────────

use memmap2::MmapMut;

use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, finalize_pciv, parse_overflow_entry};
use crate::reader::PersistentCompactIntVec;
use crate::views::{BitSliceView, IntSliceView};

/// Writable compact-int vector: one mapped byte per slot, with values that do not fit
/// kept in an in-memory overflow map (primary byte 255 marks such a slot in `get`).
pub struct PersistentCompactIntVecBuilder {
    path:     PathBuf,
    mmap:     MmapMut,
    n:        usize,
    overflow: HashMap<usize, u32>,
}

impl PersistentCompactIntVecBuilder {
    /// Create (or truncate) `path` with a header and `n` zero primary bytes, and map it.
    pub fn new(n: usize, path: &Path) -> io::Result<Self> {
        let file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(path)?;
        file.set_len((HEADER_SIZE + n) as u64)?;
        let mmap = unsafe { MmapMut::map_mut(&file)? };
        Ok(Self { path: path.to_path_buf(), mmap, n, overflow: HashMap::new() })
    }

    /// Create `path` from an existing primary byte slice and its overflow map.
    /// The vector length is `primary.len()`.
    pub fn from_raw_primary(primary: &[u8], overflow: HashMap<usize, u32>, path: &Path) -> io::Result<Self> {
        let n = primary.len();
        let file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(true)
            .open(path)?;
        file.set_len((HEADER_SIZE + n) as u64)?;
        let mut mmap = unsafe { MmapMut::map_mut(&file)? };
        mmap[HEADER_SIZE..HEADER_SIZE + n].copy_from_slice(primary);
        Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
    }

    /// Copy `source`'s file to `path`, map it, and load its overflow entries into RAM.
    pub fn build_from(source: &PersistentCompactIntVec, path: &Path) -> io::Result<Self> {
        fs::copy(source.path(), path)?;
        let file = OpenOptions::new().read(true).write(true).open(path)?;
        let mmap = unsafe { MmapMut::map_mut(&file)? };
        let n = source.len();
        // The overflow count is the little-endian u64 at header bytes 16..24; the
        // entries start right after the primary section.
        let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize;
        let data_offset = HEADER_SIZE + n;
        let overflow: HashMap<usize, u32> = (0..n_overflow)
            .map(|i| parse_overflow_entry(&mmap, data_offset, i))
            .collect();
        Ok(Self { path: path.to_path_buf(), mmap, n, overflow })
    }
pub fn get(&self, slot: usize) -> u32 { match self.mmap[HEADER_SIZE + slot] { - 255 => *self - .overflow - .get(&slot) - .expect("sentinel without overflow entry"), - v => v as u32, + 255 => *self.overflow.get(&slot).expect("sentinel without overflow entry"), + v => v as u32, } } @@ -83,61 +69,201 @@ impl PersistentCompactIntVecBuilder { } } - pub fn len(&self) -> usize { - self.n + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + + pub fn primary_bytes(&self) -> &[u8] { &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] } + pub fn primary_bytes_mut(&mut self) -> &mut [u8] { &mut self.mmap[HEADER_SIZE..HEADER_SIZE + self.n] } + pub fn clear_overflow(&mut self) { self.overflow.clear(); } + + pub fn sum(&self) -> u64 { + byte_sum(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n], self.overflow.values().copied()) + } + pub fn count_nonzero(&self) -> u64 { + byte_count_nonzero(&self.mmap[HEADER_SIZE..HEADER_SIZE + self.n]) } - pub fn is_empty(&self) -> bool { - self.n == 0 + pub fn view(&self) -> IntSliceView<'_> { + // Builder overflow is a HashMap, not sorted raw bytes — convert on the fly + // by collecting into a sorted vec and storing in a thread-local buffer. + // For read-back during building, just call get(slot) directly. + // view() is primarily useful AFTER freeze (on PersistentCompactIntVec). + // Here we expose it via a zero-alloc path: primary only, no overflow raw. + // Callers that need overflow_entries during building use overflow_entries(). 
+ let primary = &self.mmap[HEADER_SIZE..HEADER_SIZE + self.n]; + IntSliceView::new(primary, &[], 0, self.n) } - pub fn min(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - if other_val < self.get(slot) { - self.set(slot, other_val); + pub fn overflow_entries(&self) -> impl Iterator + '_ { + self.overflow.iter().map(|(&k, &v)| (k, v)) + } + + pub fn inc(&mut self, slot: usize) { + let v = self.get(slot); + self.set(slot, v.saturating_add(1)); + } + + // ── Computation methods ─────────────────────────────────────────────────── + + /// Increment one counter per 1-bit of `col`. Safe for any group size. + pub fn inc_present(&mut self, col: BitSliceView<'_>) { + let n = self.n; + for (wi, &word) in col.words().iter().enumerate() { + if word == 0 { continue; } + let mut w = word; + while w != 0 { + let bit = w.trailing_zeros() as usize; + let slot = wi * 64 + bit; + if slot < n { self.inc(slot); } + w &= w - 1; } } } - pub fn max(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - if other_val > self.get(slot) { - self.set(slot, other_val); + /// Increment one counter per 1-bit of `col`, using raw u8 arithmetic. + /// Caller guarantees no counter will reach 255 (group size < 255). + pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) { + { + let primary = self.primary_bytes_mut(); + let n = primary.len(); + for (wi, &word) in col.words().iter().enumerate() { + if word == 0 { continue; } + let mut w = word; + while w != 0 { + let bit = w.trailing_zeros() as usize; + let s = wi * 64 + bit; + if s < n { primary[s] += 1; } + w &= w - 1; + } + } + } + debug_assert!( + !self.primary_bytes().contains(&255), + "sentinel 255 reached in inc_present_fast — group size must be < 255" + ); + } + + /// Two-pass: primary bytes then overflow. 
Increments `self[slot]` for each + /// slot where `pred(col[slot])` is true. Safe for any group size. + pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + let n = col.len(); + for slot in 0..n { + let b = col.primary_bytes()[slot]; + if b < 255 && pred(b as u32) { + self.inc(slot); + } + } + for (slot, val) in col.overflow_entries() { + if pred(val) { self.inc(slot); } + } + } + + /// Fast two-pass: raw u8 arithmetic. Caller guarantees no counter reaches 255. + pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + let n = col.len(); + { + let primary = self.primary_bytes_mut(); + for slot in 0..n { + let b = col.primary_bytes()[slot]; + if b < 255 && pred(b as u32) { + primary[slot] += 1; + } + } + } + for (slot, val) in col.overflow_entries() { + if pred(val) { self.primary_bytes_mut()[slot] += 1; } + } + debug_assert!( + !self.primary_bytes().contains(&255), + "sentinel 255 reached in inc_predicate_fast — group size must be < 255" + ); + } + + pub fn add(&mut self, other: IntSliceView<'_>) { + let n = self.n; + for s in 0..n { + let sb = self.primary_bytes()[s]; + let ob = other.primary_bytes()[s]; + if sb < 255 && ob < 255 { + let sum = sb as u32 + ob as u32; + if sum < 255 { self.primary_bytes_mut()[s] = sum as u8; } + else { self.set(s, sum); } + } else { + let sv = self.get(s); + let ov = other.get(s); + self.set(s, sv + ov); } } } - pub fn add(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - let cur = self.get(slot); - self.set(slot, cur.checked_add(other_val).expect("u32 overflow in add")); + pub fn min(&mut self, other: IntSliceView<'_>) { + let self_ov: Vec<(usize, u32)> = self.overflow_entries().collect(); + let other_ov: HashMap = other.overflow_entries().collect(); + self.clear_overflow(); + for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) { + if 
b < *a { *a = b; } + } + for (slot, self_val) in self_ov { + if let Some(&other_val) = other_ov.get(&slot) { + self.set(slot, self_val.min(other_val)); + } } } - pub fn diff(&mut self, other: &PersistentCompactIntVec) { - assert_eq!(self.n, other.len(), "length mismatch"); - for (slot, other_val) in other.iter().enumerate() { - self.set(slot, self.get(slot).saturating_sub(other_val)); + pub fn max(&mut self, other: IntSliceView<'_>) { + for (slot, other_val) in other.overflow_entries() { + let sv = self.get(slot); + self.set(slot, sv.max(other_val)); + } + for (a, &b) in self.primary_bytes_mut().iter_mut().zip(other.primary_bytes()) { + if b > *a { *a = b; } + } + } + + pub fn diff(&mut self, other: IntSliceView<'_>) { + let n = self.n; + for s in 0..n { + let sb = self.primary_bytes()[s]; + let ob = other.primary_bytes()[s]; + if sb < 255 { + self.primary_bytes_mut()[s] = if ob < 255 { sb.saturating_sub(ob) } else { 0 }; + } else { + let sv = self.get(s); + let ov = if ob < 255 { ob as u32 } else { other.get(s) }; + self.set(s, sv.saturating_sub(ov)); + } + } + } + + pub fn mask_with(&mut self, mask: BitSliceView<'_>) { + let n = self.n; + for (wi, &word) in mask.words().iter().enumerate() { + if word == u64::MAX { continue; } + let mut zeros = !word; + while zeros != 0 { + let bit = zeros.trailing_zeros() as usize; + let s = wi * 64 + bit; + if s < n { + let b = self.primary_bytes()[s]; + if b != 0 { self.set(s, 0); } + } + zeros &= zeros - 1; + } } } - /// Flush the primary mmap, then write sorted overflow data + index and fix the header. 
/// Flush the mapped primary section, drop the mapping, then pass the overflow
/// entries (sorted by slot) to `finalize_pciv` for the same path.
pub fn close(self) -> io::Result<()> {
    self.mmap.flush()?;
    let Self { path, mmap, n, overflow } = self;
    drop(mmap);
    let mut entries: Vec<(usize, u32)> = overflow.into_iter().collect();
    entries.sort_unstable_by_key(|&(slot, _)| slot);
    finalize_pciv(&path, n, &entries)
}

/// Close the builder, then reopen the same path as a `PersistentCompactIntVec`.
pub fn finish(self) -> io::Result<PersistentCompactIntVec> {
    let path = self.path.clone();
    self.close()?;
    PersistentCompactIntVec::open(&path)
}
}

// ── src/obicompactvec/src/colgroup.rs (new file) ──────────────────────────────

use std::io;

use crate::tempbitvec::{TempBitVec, TempBitVecBuilder};
use crate::tempintvec::TempCompactIntVec;

// ── ColGroup ──────────────────────────────────────────────────────────────────

/// A named subset of columns, identified by their indices within the matrix.
///
/// Defined once at the index level; the same indices are valid across all
/// partitions and layers because the column structure (samples / genomes) is
/// identical everywhere — only the row space (kmer slots) is partitioned.
pub struct ColGroup {
    pub name: String,
    // NOTE(review): element type reconstructed as `usize`; the generic argument was
    // lost in this dump — confirm against the real file.
    pub indices: Vec<usize>,
}

impl ColGroup {
    pub fn new(name: impl Into<String>, indices: Vec<usize>) -> Self {
        Self { name: name.into(), indices }
    }
}

// ── MatrixGroupOps ────────────────────────────────────────────────────────────

/// Per-matrix group aggregations.
///
/// `partial_group_presence_count`, `partial_group_sum`, `partial_group_any`,
/// `partial_group_min`, `partial_group_max` are the primitives; each impl must
/// provide all five.
///
/// `partial_group_all` and `partial_group_none` have default implementations
/// derived from `partial_group_presence_count` and should rarely need overriding.
pub trait MatrixGroupOps {
    /// Per-slot count of group columns whose value ≥ `threshold`.
    fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result<TempCompactIntVec>;

    /// Per-slot sum of values across all group columns.
    fn partial_group_sum(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;

    /// Per-slot OR: 1 if any group column has value ≥ `threshold`.
    fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec>;

    /// Per-slot min value across all group columns (0 if group is empty).
    fn partial_group_min(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;

    /// Per-slot max value across all group columns (0 if group is empty).
    fn partial_group_max(&self, g: &ColGroup) -> io::Result<TempCompactIntVec>;

    /// Per-slot AND: 1 if ALL group columns have value ≥ `threshold`.
    fn partial_group_all(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
        let counts = self.partial_group_presence_count(g, threshold)?;
        // Saturate rather than truncate if the group size ever exceeds u32.
        let required = u32::try_from(g.indices.len()).unwrap_or(u32::MAX);
        mask_where(&counts, counts.len(), |v| v >= required)
    }

    /// Per-slot NOR: 1 if NO group column has value ≥ `threshold`.
    fn partial_group_none(&self, g: &ColGroup, threshold: u32) -> io::Result<TempBitVec> {
        let counts = self.partial_group_presence_count(g, threshold)?;
        mask_where(&counts, counts.len(), |v| v == 0)
    }
}

/// Build an `n`-bit mask with bit=1 at every slot where `pred(values[slot])` holds.
fn mask_where(values: &TempCompactIntVec, n: usize, pred: impl Fn(u32) -> bool) -> io::Result<TempBitVec> {
    let mut bits = TempBitVecBuilder::new(n)?;
    bits.or_where(values.view(), pred);
    bits.freeze()
}

// ── FilterMask — expression tree for column-based slot filters ────────────────

/// A composable filter expression that can be evaluated against a matrix
/// using only column operations (no MPHF lookup per kmer).
///
/// `threshold` semantics follow [`MatrixGroupOps::partial_group_presence_count`]:
/// a slot contributes to the count when its value is **≥ threshold**.
/// To match the row-level filter (`value > t`), callers should pass `t + 1`.
#[derive(Debug, Clone)]
pub enum FilterMask {
    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≥ `min_count`.
    PresenceGeq { indices: Vec<usize>, threshold: u32, min_count: usize },
    /// Slot passes if count of columns in `indices` with value ≥ `threshold` is ≤ `max_count`.
    PresenceLeq { indices: Vec<usize>, threshold: u32, max_count: usize },
    /// Slot passes if sum of values across `indices` columns is ≥ `min_sum`.
    SumGeq { indices: Vec<usize>, min_sum: u32 },
    /// Slot passes if sum of values across `indices` columns is ≤ `max_sum`.
    SumLeq { indices: Vec<usize>, max_sum: u32 },
    /// Slot passes if it passes all sub-expressions. Empty `And` is always true.
    And(Vec<FilterMask>),
}

/// Evaluate a [`FilterMask`] against `mat`, returning a per-slot `TempBitVec`
/// where bit=1 means the slot passes the filter.
///
/// `n` is the number of slots. Count bounds are compared as `u32` values; a bound
/// that does not fit in `u32` is handled explicitly instead of being truncated.
pub fn eval_filter_mask(expr: &FilterMask, mat: &dyn MatrixGroupOps, n: usize) -> io::Result<TempBitVec> {
    let group = |indices: &Vec<usize>| ColGroup::new("", indices.clone());
    match expr {
        FilterMask::PresenceGeq { indices, threshold, min_count } => {
            let counts = mat.partial_group_presence_count(&group(indices), *threshold)?;
            match u32::try_from(*min_count) {
                Ok(min) => mask_where(&counts, n, |v| v >= min),
                // No u32 count can reach a bound above u32::MAX: nothing passes.
                Err(_) => TempBitVecBuilder::new(n)?.freeze(),
            }
        }
        FilterMask::PresenceLeq { indices, threshold, max_count } => {
            let counts = mat.partial_group_presence_count(&group(indices), *threshold)?;
            // A bound above u32::MAX admits every possible count.
            let max = u32::try_from(*max_count).unwrap_or(u32::MAX);
            mask_where(&counts, n, |v| v <= max)
        }
        FilterMask::SumGeq { indices, min_sum } => {
            let sums = mat.partial_group_sum(&group(indices))?;
            let min = *min_sum;
            mask_where(&sums, n, |v| v >= min)
        }
        FilterMask::SumLeq { indices, max_sum } => {
            let sums = mat.partial_group_sum(&group(indices))?;
            let max = *max_sum;
            mask_where(&sums, n, |v| v <= max)
        }
        FilterMask::And(parts) => {
            // Start from all ones so an empty `And` passes every slot.
            let mut acc = TempBitVecBuilder::new_ones(n)?;
            for part in parts {
                let m = eval_filter_mask(part, mat, n)?;
                acc.and(m.view());
            }
            acc.freeze()
        }
    }
}

// ── src/obicompactvec/src/format.rs (added helpers) ───────────────────────────

// Index entry: slot(u64) + pos(u64) = 16 bytes.
pub const INDEX_ENTRY_SIZE: usize = 16;

/// Sum all values in a compact-int primary byte slice, correcting for overflow sentinels.
///
/// `primary` is the raw `&[u8]` where 255 is a sentinel for large values.
/// `overflow` yields the true values (≥ 255) for each sentinel, in any order.
#[inline]
pub(crate) fn byte_sum(primary: &[u8], overflow: impl Iterator<Item = u32>) -> u64 {
    let raw: u64 = primary.iter().map(|&b| u64::from(b)).sum();
    let (count, total) = overflow.fold((0u64, 0u64), |(c, t), v| (c + 1, t + u64::from(v)));
    // Add before subtracting: every overflow value is ≥ 255, so `total ≥ 255 * count`
    // and the subtraction cannot underflow even if `primary` has fewer sentinels.
    raw + total - 255 * count
}

/// Count non-zero values in a compact-int primary byte slice.
///
/// Overflow sentinels (255) are always non-zero by construction, so a single
/// `b != 0` test is sufficient — no overflow map lookup needed.
#[inline]
pub(crate) fn byte_count_nonzero(primary: &[u8]) -> u64 {
    primary.iter().filter(|&&b| b != 0).count() as u64
}

/// Parse overflow entry `i` — `(slot: u64 LE, value: u32 LE)` — from `data`, where
/// the entry array starts at byte offset `base`. Panics if the entry lies outside `data`.
#[inline]
pub fn parse_overflow_entry(data: &[u8], base: usize, i: usize) -> (usize, u32) {
    let off = base + i * OVERFLOW_ENTRY_SIZE;
    let slot = u64::from_le_bytes(data[off..off + 8].try_into().unwrap()) as usize;
    let value = u32::from_le_bytes(data[off + 8..off + 12].try_into().unwrap());
    (slot, value)
}

/// Parse a single sparse-index entry `(slot, pos)` from a byte slice.
+#[inline] +pub fn parse_index_entry(data: &[u8], base: usize, i: usize) -> (usize, usize) { + let off = base + i * INDEX_ENTRY_SIZE; + let slot = u64::from_le_bytes(data[off..off+8].try_into().unwrap()) as usize; + let pos = u64::from_le_bytes(data[off+8..off+16].try_into().unwrap()) as usize; + (slot, pos) +} + // Sparse index target: ≤ 32 KB in L1 cache (16 B per entry → 2048 entries). pub const L1_INDEX_ENTRIES: usize = 2048; diff --git a/src/obicompactvec/src/intmatrix.rs b/src/obicompactvec/src/intmatrix.rs index b563335..b2fa97e 100644 --- a/src/obicompactvec/src/intmatrix.rs +++ b/src/obicompactvec/src/intmatrix.rs @@ -1,4 +1,3 @@ -use std::cmp::Ordering; use std::fs::{self, File}; use std::io::{self, BufWriter, Write as _}; use std::path::{Path, PathBuf}; @@ -7,10 +6,15 @@ use memmap2::Mmap; use ndarray::{Array1, Array2}; use rayon::prelude::*; +use crate::bitmatrix::{pairwise_matrix, pairwise2_matrix}; use crate::builder::PersistentCompactIntVecBuilder; -use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, OVERFLOW_ENTRY_SIZE}; +use crate::colgroup::{ColGroup, MatrixGroupOps}; +use crate::format::{HEADER_SIZE, OVERFLOW_ENTRY_SIZE}; use crate::meta::MatrixMeta; use crate::reader::PersistentCompactIntVec; +use crate::tempbitvec::{TempBitVec, TempBitVecBuilder}; +use crate::tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; +use crate::views::IntSliceView; fn col_path(dir: &Path, col: usize) -> PathBuf { dir.join(format!("col_{col:06}.pciv")) @@ -41,9 +45,7 @@ impl ColumnarCompactIntMatrix { } pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) { - for (c, col) in self.cols.iter().enumerate() { - buf[c] = col.get(slot); - } + for (c, col) in self.cols.iter().enumerate() { buf[c] = col.get(slot); } } pub(crate) fn sum(&self) -> Array1 { @@ -63,49 +65,26 @@ impl ColumnarCompactIntMatrix { } pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { - self.pairwise_u64(|i, j| self.col(i).partial_bray_dist(self.col(j))) + 
pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_bray_dist(self.col(j))) } - pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.col(i).partial_euclidean_dist(self.col(j))) + pairwise_matrix(self.n_cols(), |i, j| self.col(i).partial_euclidean_dist(self.col(j))) } - - pub(crate) fn partial_threshold_jaccard_dist_matrix( - &self, threshold: u32, - ) -> (Array2, Array2) { - let n = self.n_cols(); - let pairs = upper_pairs(n); - let results: Vec<(usize, usize, u64, u64)> = pairs - .into_par_iter() - .map(|(i, j)| { - let (inter, union) = - self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold); - (i, j, inter, union) - }) - .collect(); - let mut inter_m = Array2::zeros((n, n)); - let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, threshold: u32) -> (Array2, Array2) { + pairwise2_matrix(self.n_cols(), |i, j| self.col(i).partial_threshold_jaccard_dist(self.col(j), threshold)) } - pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| { + pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_relfreq_bray_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } - pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| { + pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_relfreq_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } - pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| { + pairwise_matrix(self.n_cols(), |i, j| { self.col(i).partial_hellinger_euclidean_dist(self.col(j), col_sums[i] as f64, col_sums[j] as f64) }) } @@ -118,20 +97,6 @@ impl ColumnarCompactIntMatrix { 
meta.n_cols += 1; meta.save(dir) } - - fn pairwise(&self, f: impl Fn(usize, usize) -> f64 + Sync) -> Array2 { - let n = self.n_cols(); - let results: Vec<(usize, usize, f64)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } - - fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { - let n = self.n_cols(); - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) - } } // ── PackedCompactIntMatrix ──────────────────────────────────────────────────── @@ -139,13 +104,10 @@ impl ColumnarCompactIntMatrix { const PCMX_MAGIC: [u8; 4] = *b"PCMX"; const PCMX_HEADER: usize = 24; // magic(4) + pad(4) + n_rows(8) + n_cols(8) -/// Per-column metadata pre-parsed from the embedded PCIV header. struct ColInfo { - primary_start: usize, // absolute mmap offset to primary array - data_offset: usize, // absolute mmap offset to overflow array + primary_start: usize, + data_offset: usize, n_overflow: usize, - step: usize, - index: Vec<(usize, usize)>, } pub struct PackedCompactIntMatrix { @@ -171,61 +133,31 @@ impl PackedCompactIntMatrix { for c in 0..n_cols { let off_pos = PCMX_HEADER + c * 8; let col_base = u64::from_le_bytes(mmap[off_pos..off_pos+8].try_into().unwrap()) as usize; - // Parse embedded PCIV header at col_base - let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize; - let n_idx = u64::from_le_bytes(mmap[col_base+24..col_base+32].try_into().unwrap()) as usize; - let step = u64::from_le_bytes(mmap[col_base+32..col_base+40].try_into().unwrap()) as usize; - let n_pciv = u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize; - + let n_ov = u64::from_le_bytes(mmap[col_base+16..col_base+24].try_into().unwrap()) as usize; + let n_pciv = 
u64::from_le_bytes(mmap[col_base+8..col_base+16].try_into().unwrap()) as usize; let primary_start = col_base + HEADER_SIZE; let data_offset = primary_start + n_pciv; - let index_offset = data_offset + n_ov * OVERFLOW_ENTRY_SIZE; - - let mut index = Vec::with_capacity(n_idx); - for i in 0..n_idx { - let ioff = index_offset + i * INDEX_ENTRY_SIZE; - let slot = u64::from_le_bytes(mmap[ioff..ioff+8].try_into().unwrap()) as usize; - let pos = u64::from_le_bytes(mmap[ioff+8..ioff+16].try_into().unwrap()) as usize; - index.push((slot, pos)); - } - columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov, step, index }); + columns.push(ColInfo { primary_start, data_offset, n_overflow: n_ov }); } - Ok(Self { mmap, n_rows, n_cols, columns }) } - #[inline] - pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { - let ci = &self.columns[col]; - let v = self.mmap[ci.primary_start + slot]; - if v < 255 { return v as u32; } - self.overflow_get(ci, slot) + pub(crate) fn col_view(&self, c: usize) -> IntSliceView<'_> { + let ci = &self.columns[c]; + let primary = &self.mmap[ci.primary_start..ci.primary_start + self.n_rows]; + let overflow_raw = &self.mmap[ci.data_offset..ci.data_offset + ci.n_overflow * OVERFLOW_ENTRY_SIZE]; + IntSliceView::new(primary, overflow_raw, ci.n_overflow, self.n_rows) } - fn overflow_get(&self, ci: &ColInfo, slot: usize) -> u32 { - let (pos_start, pos_end) = if ci.step == 0 { - (0, ci.n_overflow) - } else { - let i = ci.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); - let start = ci.index[i].1; - let end = if i + 1 < ci.index.len() { ci.index[i+1].1 } else { ci.n_overflow }; - (start, end) - }; - let mut lo = pos_start; - let mut hi = pos_end; - while lo < hi { - let mid = lo + (hi - lo) / 2; - let off = ci.data_offset + mid * OVERFLOW_ENTRY_SIZE; - let stored = u64::from_le_bytes(self.mmap[off..off+8].try_into().unwrap()) as usize; - match stored.cmp(&slot) { - Ordering::Equal => return 
u32::from_le_bytes(self.mmap[off+8..off+12].try_into().unwrap()), - Ordering::Less => lo = mid + 1, - Ordering::Greater => hi = mid, - } - } - panic!("slot {slot} marked overflow but not found") + pub(crate) fn col_persist(&self, c: usize, path: &Path) -> io::Result { + let view = self.col_view(c); + let overflow: std::collections::HashMap = view.overflow_entries().collect(); + PersistentCompactIntVecBuilder::from_raw_primary(view.primary_bytes(), overflow, path) } + #[inline] + pub(crate) fn get(&self, col: usize, slot: usize) -> u32 { self.col_view(col).get(slot) } + pub(crate) fn fill_row(&self, slot: usize, buf: &mut [u32]) { for c in 0..self.n_cols { buf[c] = self.get(c, slot); } } @@ -236,152 +168,85 @@ impl PackedCompactIntMatrix { pub(crate) fn sum(&self) -> Array1 { Array1::from_vec( - (0..self.n_cols).into_par_iter() - .map(|c| (0..self.n_rows).map(|s| self.get(c, s) as u64).sum()) - .collect() + (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).sum()).collect() ) } pub(crate) fn count_nonzero(&self) -> Array1 { Array1::from_vec( - (0..self.n_cols).into_par_iter() - .map(|c| (0..self.n_rows).filter(|&s| self.get(c, s) > 0).count() as u64) - .collect() + (0..self.n_cols).into_par_iter().map(|c| self.col_view(c).count_nonzero()).collect() ) } - // ── Pair primitives ─────────────────────────────────────────────────────── - fn pair_partial_bray(&self, i: usize, j: usize) -> u64 { - (0..self.n_rows).map(|s| self.get(i, s).min(self.get(j, s)) as u64).sum() + self.col_view(i).iter().zip(self.col_view(j).iter()).map(|(a, b)| a.min(b) as u64).sum() } - fn pair_partial_euclidean(&self, i: usize, j: usize) -> f64 { - (0..self.n_rows).map(|s| { - let d = self.get(i, s) as f64 - self.get(j, s) as f64; - d * d - }).sum() + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }).sum() } - fn pair_partial_threshold_jaccard(&self, i: usize, j: usize, t: u32) -> (u64, u64) { - let (mut inter, mut union) = (0u64, 
0u64); - for s in 0..self.n_rows { - let a = self.get(i, s) >= t; - let b = self.get(j, s) >= t; - if a && b { inter += 1; } - if a || b { union += 1; } - } - (inter, union) + self.col_view(i).iter().zip(self.col_view(j).iter()) + .fold((0u64, 0u64), |(inter, uni), (a, b)| { + let ap = a >= t; let bp = b >= t; + (inter + (ap & bp) as u64, uni + (ap | bp) as u64) + }) } - fn pair_partial_relfreq_bray(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - (0..self.n_rows).map(|s| { - (self.get(i, s) as f64 / si).min(self.get(j, s) as f64 / sj) - }).sum() + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| (a as f64 / si).min(b as f64 / sj)).sum() } - fn pair_partial_relfreq_euclidean(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - (0..self.n_rows).map(|s| { - let d = self.get(i, s) as f64 / si - self.get(j, s) as f64 / sj; - d * d - }).sum() + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| { let d = a as f64 / si - b as f64 / sj; d * d }).sum() } - fn pair_partial_hellinger(&self, i: usize, j: usize, si: f64, sj: f64) -> f64 { if si == 0.0 || sj == 0.0 { return 0.0; } - (0..self.n_rows).map(|s| { - let d = (self.get(i, s) as f64 / si).sqrt() - (self.get(j, s) as f64 / sj).sqrt(); - d * d - }).sum() - } - - // ── Matrix methods ──────────────────────────────────────────────────────── - - fn pairwise(&self, f: impl Fn(usize, usize) -> T + Sync) -> Array2 - where T: Clone + Default + Send { - let n = self.n_cols; - let results: Vec<(usize, usize, T)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - fill_symmetric(n, results.into_iter().map(|(i, j, v)| { let w = v.clone(); (i, j, v, w) })) - } - - fn pairwise_u64(&self, f: impl Fn(usize, usize) -> u64 + Sync) -> Array2 { - let n = self.n_cols; - let results: Vec<(usize, usize, u64)> = upper_pairs(n) - .into_par_iter().map(|(i, j)| (i, j, f(i, j))).collect(); - 
fill_symmetric(n, results.into_iter().map(|(i, j, v)| (i, j, v, v))) + self.col_view(i).iter().zip(self.col_view(j).iter()) + .map(|(a, b)| { let d = (a as f64 / si).sqrt() - (b as f64 / sj).sqrt(); d * d }).sum() } pub(crate) fn partial_bray_dist_matrix(&self) -> Array2 { - self.pairwise_u64(|i, j| self.pair_partial_bray(i, j)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_bray(i, j)) } - - pub(crate) fn partial_euclidean_dist_matrix(&self) -> Array2 { - self.pairwise(|i, j| self.pair_partial_euclidean(i, j)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_euclidean(i, j)) } - pub(crate) fn partial_threshold_jaccard_dist_matrix(&self, t: u32) -> (Array2, Array2) { - let n = self.n_cols; - let results: Vec<(usize, usize, u64, u64)> = upper_pairs(n) - .into_par_iter() - .map(|(i, j)| { let (inter, union) = self.pair_partial_threshold_jaccard(i, j, t); (i, j, inter, union) }) - .collect(); - let mut inter_m = Array2::zeros((n, n)); - let mut union_m = Array2::zeros((n, n)); - for (i, j, inter, union) in results { - inter_m[[i, j]] = inter; inter_m[[j, i]] = inter; - union_m[[i, j]] = union; union_m[[j, i]] = union; - } - (inter_m, union_m) + pairwise2_matrix(self.n_cols, |i, j| self.pair_partial_threshold_jaccard(i, j, t)) } - pub(crate) fn partial_relfreq_bray_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_bray(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - pub(crate) fn partial_relfreq_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - self.pairwise(|i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_relfreq_euclidean(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - pub(crate) fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { - 
self.pairwise(|i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64)) + pairwise_matrix(self.n_cols, |i, j| self.pair_partial_hellinger(i, j, col_sums[i] as f64, col_sums[j] as f64)) } - } /// Build `counts/matrix.pcmx` from existing `col_*.pciv` files. pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> { let packed_path = dir.join("matrix.pcmx"); if packed_path.exists() { - // Matrix complete; remove any leftover column files from a killed cleanup. if let Ok(meta) = MatrixMeta::load(dir) { for c in 0..meta.n_cols { let _ = fs::remove_file(col_path(dir, c)); } let _ = fs::remove_file(dir.join("meta.json")); } return Ok(()); } - - let meta = MatrixMeta::load(dir)?; + let meta = MatrixMeta::load(dir)?; let n_cols = meta.n_cols; - - // Compute offsets from file sizes — no column data loaded into RAM. let col_sizes: Vec = (0..n_cols) .map(|c| fs::metadata(col_path(dir, c)).map(|m| m.len())) .collect::>()?; - let header_size = (PCMX_HEADER + n_cols * 8) as u64; let mut col_offset = header_size; let mut offsets = Vec::with_capacity(n_cols); - for &size in &col_sizes { - offsets.push(col_offset); - col_offset += size; - } - - // Write to a temp file; rename atomically so a killed process never leaves - // a truncated matrix.pcmx that would be mistaken for a complete file. 
+ for &size in &col_sizes { offsets.push(col_offset); col_offset += size; } let tmp_path = dir.join("matrix.pcmx.tmp"); let mut out = BufWriter::new(File::create(&tmp_path)?); out.write_all(&PCMX_MAGIC)?; @@ -389,13 +254,10 @@ pub fn pack_compact_int_matrix(dir: &Path) -> io::Result<()> { out.write_all(&(meta.n as u64).to_le_bytes())?; out.write_all(&(n_cols as u64).to_le_bytes())?; for &off in &offsets { out.write_all(&off.to_le_bytes())?; } - for c in 0..n_cols { - io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; - } + for c in 0..n_cols { io::copy(&mut File::open(col_path(dir, c))?, &mut out)?; } out.flush()?; drop(out); fs::rename(&tmp_path, &packed_path)?; - for c in 0..n_cols { fs::remove_file(col_path(dir, c))?; } fs::remove_file(dir.join("meta.json"))?; Ok(()) @@ -409,18 +271,14 @@ pub enum PersistentCompactIntMatrix { } impl PersistentCompactIntMatrix { - /// Open from `layer_dir`, auto-detecting Packed or Columnar. pub fn open(layer_dir: &Path) -> io::Result { let counts_dir = layer_dir.join("counts"); - if counts_dir.join("matrix.pcmx").exists() { return Ok(Self::Packed(PackedCompactIntMatrix::open(&counts_dir.join("matrix.pcmx"))?)); } - if MatrixMeta::load(&counts_dir).is_ok() { return Ok(Self::Columnar(ColumnarCompactIntMatrix::open(&counts_dir)?)); } - Err(io::Error::new( io::ErrorKind::NotFound, format!("no count matrix found in {} — run 'obikmer upgrade'", layer_dir.display()), @@ -430,7 +288,6 @@ impl PersistentCompactIntMatrix { pub fn n(&self) -> usize { match self { Self::Columnar(m) => m.n(), Self::Packed(m) => m.n_rows } } - pub fn n_cols(&self) -> usize { match self { Self::Columnar(m) => m.n_cols(), Self::Packed(m) => m.n_cols } } @@ -442,22 +299,32 @@ impl PersistentCompactIntMatrix { } } + pub fn col_view(&self, c: usize) -> IntSliceView<'_> { + match self { + Self::Columnar(m) => m.col(c).view(), + Self::Packed(m) => m.col_view(c), + } + } + + pub fn col_persist(&self, c: usize, path: &Path) -> io::Result { + match self { + 
Self::Columnar(m) => PersistentCompactIntVecBuilder::build_from(m.col(c), path), + Self::Packed(m) => m.col_persist(c, path), + } + } + pub fn row(&self, slot: usize) -> Box<[u32]> { match self { Self::Columnar(m) => m.row(slot), Self::Packed(m) => m.row(slot) } } - pub fn fill_row(&self, slot: usize, buf: &mut [u32]) { match self { Self::Columnar(m) => m.fill_row(slot, buf), Self::Packed(m) => m.fill_row(slot, buf) } } - pub fn sum(&self) -> Array1 { match self { Self::Columnar(m) => m.sum(), Self::Packed(m) => m.sum() } } - pub fn count_nonzero(&self) -> Array1 { match self { Self::Columnar(m) => m.count_nonzero(), Self::Packed(m) => m.count_nonzero() } } - pub fn partial_bray_dist_matrix(&self) -> Array2 { match self { Self::Columnar(m) => m.partial_bray_dist_matrix(), Self::Packed(m) => m.partial_bray_dist_matrix() } } @@ -476,7 +343,6 @@ impl PersistentCompactIntMatrix { pub fn partial_hellinger_euclidean_dist_matrix(&self, col_sums: &Array1) -> Array2 { match self { Self::Columnar(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums), Self::Packed(m) => m.partial_hellinger_euclidean_dist_matrix(col_sums) } } - pub fn append_column(dir: &Path, value_of: impl Fn(usize) -> u32) -> io::Result<()> { ColumnarCompactIntMatrix::append_column(dir, value_of) } @@ -492,12 +358,12 @@ impl ColumnWeights for PersistentCompactIntMatrix { } impl CountPartials for PersistentCompactIntMatrix { - fn partial_bray(&self) -> Array2 { self.partial_bray_dist_matrix() } - fn partial_euclidean(&self) -> Array2 { self.partial_euclidean_dist_matrix() } + fn partial_bray(&self) -> Array2 { self.partial_bray_dist_matrix() } + fn partial_euclidean(&self) -> Array2 { self.partial_euclidean_dist_matrix() } fn partial_threshold_jaccard(&self, t: u32) -> (Array2, Array2) { self.partial_threshold_jaccard_dist_matrix(t) } - fn partial_relfreq_bray(&self, g: &Array1) -> Array2 { self.partial_relfreq_bray_dist_matrix(g) } - fn partial_relfreq_euclidean(&self, g: &Array1) -> Array2 { 
self.partial_relfreq_euclidean_dist_matrix(g) } - fn partial_hellinger(&self, g: &Array1) -> Array2 { self.partial_hellinger_euclidean_dist_matrix(g) } + fn partial_relfreq_bray(&self, g: &Array1) -> Array2 { self.partial_relfreq_bray_dist_matrix(g) } + fn partial_relfreq_euclidean(&self, g: &Array1) -> Array2 { self.partial_relfreq_euclidean_dist_matrix(g) } + fn partial_hellinger(&self, g: &Array1) -> Array2 { self.partial_hellinger_euclidean_dist_matrix(g) } } // ── Builder ─────────────────────────────────────────────────────────────────── @@ -513,30 +379,88 @@ impl PersistentCompactIntMatrixBuilder { fs::create_dir_all(dir)?; Ok(Self { dir: dir.to_path_buf(), n, n_cols: 0 }) } - pub fn n(&self) -> usize { self.n } pub fn n_cols(&self) -> usize { self.n_cols } - pub fn add_col(&mut self) -> io::Result { let path = col_path(&self.dir, self.n_cols); self.n_cols += 1; PersistentCompactIntVecBuilder::new(self.n, &path) } + pub fn add_col_from(&mut self, src: &TempCompactIntVec) -> io::Result<()> { + src.make_persistent(&col_path(&self.dir, self.n_cols))?; + self.n_cols += 1; + Ok(()) + } + + pub fn add_col_from_bit(&mut self, src: &TempBitVec) -> io::Result<()> { + let path = col_path(&self.dir, self.n_cols); + self.n_cols += 1; + let mut b = PersistentCompactIntVecBuilder::new(self.n, &path)?; + b.inc_present(src.view()); + b.close() + } + pub fn close(self) -> io::Result<()> { MatrixMeta { n: self.n, n_cols: self.n_cols }.save(&self.dir) } } -// ── Helpers ─────────────────────────────────────────────────────────────────── +// ── MatrixGroupOps ──────────────────────────────────────────────────────────── -fn upper_pairs(n: usize) -> Vec<(usize, usize)> { - (0..n).flat_map(|i| (i + 1..n).map(move |j| (i, j))).collect() -} +impl MatrixGroupOps for PersistentCompactIntMatrix { + fn partial_group_presence_count(&self, g: &ColGroup, threshold: u32) -> io::Result { + let n = self.n(); + if g.indices.len() < 255 { + let mut builder = TempCompactIntVecBuilder::new(n)?; + 
for &c in &g.indices { + builder.inc_predicate_fast(self.col_view(c), |v| v >= threshold); + } + builder.freeze() + } else { + let mut result = TempCompactIntVecBuilder::new(n)?; + for chunk in g.indices.chunks(254) { + let mut chunk_b = TempCompactIntVecBuilder::new(n)?; + for &c in chunk { + chunk_b.inc_predicate_fast(self.col_view(c), |v| v >= threshold); + } + let frozen = chunk_b.freeze()?; + result.add(frozen.view()); + } + result.freeze() + } + } -fn fill_symmetric(n: usize, vals: impl Iterator) -> Array2 -where T: Clone + Default { - let mut m = Array2::from_elem((n, n), T::default()); - for (i, j, vij, vji) in vals { m[[i, j]] = vij; m[[j, i]] = vji; } - m + fn partial_group_sum(&self, g: &ColGroup) -> io::Result { + let n = self.n(); + let mut result = TempCompactIntVecBuilder::new(n)?; + for &c in &g.indices { result.add(self.col_view(c)); } + result.freeze() + } + + fn partial_group_any(&self, g: &ColGroup, threshold: u32) -> io::Result { + let n = self.n(); + let mut result = TempBitVecBuilder::new(n)?; + for &c in &g.indices { + result.or_where(self.col_view(c), |v| v >= threshold); + } + result.freeze() + } + + fn partial_group_min(&self, g: &ColGroup) -> io::Result { + let n = self.n(); + let mut result = TempCompactIntVecBuilder::new(n)?; + if let Some((&first, rest)) = g.indices.split_first() { + result.add(self.col_view(first)); + for &c in rest { result.min(self.col_view(c)); } + } + result.freeze() + } + + fn partial_group_max(&self, g: &ColGroup) -> io::Result { + let n = self.n(); + let mut result = TempCompactIntVecBuilder::new(n)?; + for &c in &g.indices { result.max(self.col_view(c)); } + result.freeze() + } } diff --git a/src/obicompactvec/src/layer_meta.rs b/src/obicompactvec/src/layer_meta.rs index 65dc5bc..28fff0c 100644 --- a/src/obicompactvec/src/layer_meta.rs +++ b/src/obicompactvec/src/layer_meta.rs @@ -23,11 +23,6 @@ impl LayerMeta { } fn parse(s: &str) -> Option { - let key = "\"n\":"; - let pos = s.find(key)? 
+ key.len(); - let rest = s[pos..].trim_start(); - let end = rest.find(|c: char| !c.is_ascii_digit()).unwrap_or(rest.len()); - let n = rest[..end].parse().ok()?; - Some(Self { n }) + Some(Self { n: crate::meta::field(s, "n")? }) } } diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index 8a1e5bb..9041ab7 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -1,20 +1,28 @@ mod bitvec; mod bitmatrix; mod builder; +mod colgroup; mod format; mod intmatrix; mod layer_meta; mod meta; mod reader; +mod tempbitvec; +mod tempintvec; +mod views; pub mod traits; pub use bitvec::{BitIter, PersistentBitVec, PersistentBitVecBuilder}; pub use bitmatrix::{PersistentBitMatrix, PersistentBitMatrixBuilder, pack_bit_matrix}; pub use builder::PersistentCompactIntVecBuilder; +pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask}; pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use layer_meta::LayerMeta; -pub use reader::PersistentCompactIntVec; +pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter}; +pub use tempbitvec::{TempBitVec, TempBitVecBuilder}; +pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; pub use traits::{BitPartials, ColumnWeights, CountPartials}; +pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter}; #[cfg(test)] #[path = "tests/mod.rs"] diff --git a/src/obicompactvec/src/meta.rs b/src/obicompactvec/src/meta.rs index d8d8466..09deedc 100644 --- a/src/obicompactvec/src/meta.rs +++ b/src/obicompactvec/src/meta.rs @@ -23,7 +23,7 @@ fn parse(s: &str) -> Option { Some(MatrixMeta { n: field(s, "n")?, n_cols: field(s, "n_cols")? }) } -fn field(s: &str, name: &str) -> Option { +pub(crate) fn field(s: &str, name: &str) -> Option { let key = format!("\"{}\":", name); let pos = s.find(&key)? 
+ key.len(); let rest = s[pos..].trim_start(); diff --git a/src/obicompactvec/src/reader.rs b/src/obicompactvec/src/reader.rs index 057ce29..f3b1dd6 100644 --- a/src/obicompactvec/src/reader.rs +++ b/src/obicompactvec/src/reader.rs @@ -4,7 +4,8 @@ use std::path::{Path, PathBuf}; use memmap2::Mmap; -use crate::format::{HEADER_SIZE, INDEX_ENTRY_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE}; +use crate::format::{byte_count_nonzero, byte_sum, HEADER_SIZE, MAGIC, OVERFLOW_ENTRY_SIZE, parse_index_entry}; +use crate::views::IntSliceView; pub struct PersistentCompactIntVec { mmap: Mmap, @@ -18,100 +19,60 @@ pub struct PersistentCompactIntVec { } impl PersistentCompactIntVec { - /// Opens a persistent compact int vector from the given path. pub fn open(path: &Path) -> io::Result { let mmap = unsafe { Mmap::map(&File::open(path)?)? }; if mmap.len() < HEADER_SIZE { - return Err(io::Error::new( - io::ErrorKind::InvalidData, - "PCIV file too short", - )); + return Err(io::Error::new(io::ErrorKind::InvalidData, "PCIV file too short")); } if &mmap[0..4] != &MAGIC { return Err(io::Error::new(io::ErrorKind::InvalidData, "bad PCIV magic")); } - let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; + let n = u64::from_le_bytes(mmap[8..16].try_into().unwrap()) as usize; let n_overflow = u64::from_le_bytes(mmap[16..24].try_into().unwrap()) as usize; - let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize; - let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize; + let n_index = u64::from_le_bytes(mmap[24..32].try_into().unwrap()) as usize; + let step = u64::from_le_bytes(mmap[32..40].try_into().unwrap()) as usize; let primary_offset = HEADER_SIZE; - let data_offset = primary_offset + n; - let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE; + let data_offset = primary_offset + n; + let index_offset = data_offset + n_overflow * OVERFLOW_ENTRY_SIZE; let mut index = Vec::with_capacity(n_index); for i in 0..n_index { - let off = 
index_offset + i * INDEX_ENTRY_SIZE; - let slot = u64::from_le_bytes(mmap[off..off + 8].try_into().unwrap()) as usize; - let pos = u64::from_le_bytes(mmap[off + 8..off + 16].try_into().unwrap()) as usize; - index.push((slot, pos)); + index.push(parse_index_entry(&mmap, index_offset, i)); } - Ok(Self { - mmap, - n, - n_overflow, - step, - index, - primary_offset, - data_offset, - path: path.to_path_buf(), - }) + Ok(Self { mmap, n, n_overflow, step, index, primary_offset, data_offset, path: path.to_path_buf() }) } - /// Returns the path of the compact int vector file. - pub fn path(&self) -> &Path { - &self.path - } + pub fn path(&self) -> &Path { &self.path } + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } - /// Returns the length of the compact int vector. - pub fn len(&self) -> usize { - self.n - } - - /// Returns whether the compact int vector is empty. - pub fn is_empty(&self) -> bool { - self.n == 0 - } - - /// Returns the value at the given slot. pub fn get(&self, slot: usize) -> u32 { match self.mmap[self.primary_offset + slot] { 255 => self.overflow_get(slot), - v => v as u32, + v => v as u32, } } - /// Returns the value at the given slot from the overflow region. 
fn overflow_get(&self, slot: usize) -> u32 { - let pos_start; - let pos_end; - - if self.step == 0 { - pos_start = 0; - pos_end = self.n_overflow; + let (pos_start, pos_end) = if self.step == 0 { + (0, self.n_overflow) } else { - let i = self - .index - .partition_point(|&(s, _)| s <= slot) - .saturating_sub(1); - pos_start = self.index[i].1; - pos_end = if i + 1 < self.index.len() { - self.index[i + 1].1 - } else { - self.n_overflow - }; - } - + let i = self.index.partition_point(|&(s, _)| s <= slot).saturating_sub(1); + let start = self.index[i].1; + let end = if i + 1 < self.index.len() { self.index[i + 1].1 } else { self.n_overflow }; + (start, end) + }; let mut lo = pos_start; let mut hi = pos_end; while lo < hi { let mid = lo + (hi - lo) / 2; match self.data_slot(mid).cmp(&slot) { - std::cmp::Ordering::Equal => return self.data_value(mid), - std::cmp::Ordering::Less => lo = mid + 1, + std::cmp::Ordering::Equal => return self.data_value(mid), + std::cmp::Ordering::Less => lo = mid + 1, std::cmp::Ordering::Greater => hi = mid, } } @@ -119,144 +80,91 @@ impl PersistentCompactIntVec { } #[inline] - /// Returns the slot at the given index in the overflow region. fn data_slot(&self, i: usize) -> usize { let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE; u64::from_le_bytes(self.mmap[off..off + 8].try_into().unwrap()) as usize } #[inline] - /// Returns the value at the given index in the overflow region. 
fn data_value(&self, i: usize) -> u32 { let off = self.data_offset + i * OVERFLOW_ENTRY_SIZE + 8; u32::from_le_bytes(self.mmap[off..off + 4].try_into().unwrap()) } - #[inline] pub fn sum(&self) -> u64 { - self.iter().map(|v| v as u64).sum() + let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n]; + byte_sum(primary, (0..self.n_overflow).map(|i| self.data_value(i))) } - #[inline] pub fn count_nonzero(&self) -> u64 { - self.iter().filter(|&v| v > 0).count() as u64 + let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n]; + byte_count_nonzero(primary) } - #[inline] - /// Returns the Bray-Curtis distance between two compact int vectors. + /// Lightweight zero-copy view — primary and overflow point into the mmap. + pub fn view(&self) -> IntSliceView<'_> { + let primary = &self.mmap[self.primary_offset..self.primary_offset + self.n]; + let overflow_raw = &self.mmap[self.data_offset..self.data_offset + self.n_overflow * OVERFLOW_ENTRY_SIZE]; + IntSliceView::new(primary, overflow_raw, self.n_overflow, self.n) + } + + pub fn iter(&self) -> Iter<'_> { + Iter { pciv: self, slot: 0, overflow_pos: 0 } + } + + // ── Distance methods ────────────────────────────────────────────────────── + pub fn bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { let sum_min = self.partial_bray_dist(other); let denom = self.sum() + other.sum(); - if denom == 0 { - return 0.0; - } - 1.0 - 2.0 * sum_min as f64 / denom as f64 + if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 } } - /// Returns `Σ_slot min(self[slot], other[slot])` — the additive numerator of Bray-Curtis. - /// The denominator `sum_a + sum_b` is obtained from `self.sum() + other.sum()`. 
pub fn partial_bray_dist(&self, other: &PersistentCompactIntVec) -> u64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) - .map(|(a, b)| a.min(b) as u64) - .sum() + self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum() } - /// Returns the relative frequency Bray-Curtis distance between two compact int vectors. - /// - /// This is a variant of [`bray_dist`] that uses relative frequencies instead of raw counts. pub fn relfreq_bray_dist(&self, other: &PersistentCompactIntVec) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - let sum_a = self.sum() as f64; - let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { - return 0.0; - } - let sum_min = self.partial_relfreq_bray_dist(other, sum_a, sum_b); - 1.0 - sum_min + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + 1.0 - self.partial_relfreq_bray_dist(other, sa, sb) } - /// Returns the partial relative frequency Bray-Curtis distance between two compact int vectors. - /// - /// This is used internally by [`relfreq_bray_dist`] and to easily compute the relative frequency - /// Bray-Curtis distance over a set of vector pairs. - /// - /// Arguments: - /// - `other`: the other compact int vector to compare with - /// - `sum_a`: the sum of the first vector's counts - /// - `sum_b`: the sum of the second vector's counts - /// - /// Returns the sum of the minimum relative frequencies at each index. 
- pub fn partial_relfreq_bray_dist( - &self, - other: &PersistentCompactIntVec, - sum_a: f64, - sum_b: f64, - ) -> f64 { + pub fn partial_relfreq_bray_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - let sum_min: f64 = self - .iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .map(|(a, b)| { let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; pa.min(pb) }) - .sum(); - sum_min + .sum() } - /// Returns the euclidean distance between two compact int vectors. pub fn euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { self.partial_euclidean_dist(other).sqrt() } - /// Returns the partial euclidean distance between two compact int vectors. - /// - /// This is used internally by [`euclidean_dist`] and to easily compute the euclidean distance - /// over a set of vector pairs. - /// - /// The result is the sum of the squared differences between corresponding elements of the two - /// vectors. pub fn partial_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) - .map(|(a, b)| { - let d = a as f64 - b as f64; - d * d - }) + self.iter().zip(other.iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) .sum() } - /// Returns the relative frequency euclidean distance between two compact int vectors. - /// - /// This is a variant of [`euclidean_dist`] that uses relative frequencies instead of raw counts. 
pub fn relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); - let sum_a = self.sum() as f64; - let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { - return 0.0; - } - self.partial_relfreq_euclidean_dist(other, sum_a, sum_b) - .sqrt() + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt() } - /// Returns the partial relative frequency euclidean distance between two compact int vectors. - /// - /// This is used internally by [`relfreq_euclidean_dist`] and to easily compute the relative frequency - /// euclidean distance over a set of vector pairs. - pub fn partial_relfreq_euclidean_dist( - &self, - other: &PersistentCompactIntVec, - sum_a: f64, - sum_b: f64, - ) -> f64 { + pub fn partial_relfreq_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .map(|(a, b)| { let pa = if sum_a > 0.0 { a as f64 / sum_a } else { 0.0 }; let pb = if sum_b > 0.0 { b as f64 / sum_b } else { 0.0 }; @@ -266,46 +174,19 @@ impl PersistentCompactIntVec { .sum() } - /// Returns the Euclidean distance between two compact int vectors using the Hellinger transform. - /// - /// The Hellinger transform is applied to the raw counts of each vector, and the result is - /// the Euclidean distance between the transformed vectors. The Hellinger transform is defined - /// as the square root of the relative frequencies. 
pub fn hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); - let sum_a = self.sum() as f64; - let sum_b = other.sum() as f64; - if sum_a == 0.0 && sum_b == 0.0 { - return 0.0; - } - self.partial_hellinger_euclidean_dist(other, sum_a, sum_b) - .sqrt() + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt() } - /// Returns the partial Hellinger Euclidean distance between two compact int vectors. - /// - /// This is used internally by [`hellinger_euclidean_dist`] and to easily compute the Hellinger - /// Euclidean distance over a set of vector pairs. - pub fn partial_hellinger_euclidean_dist( - &self, - other: &PersistentCompactIntVec, - sum_a: f64, - sum_b: f64, - ) -> f64 { + pub fn partial_hellinger_euclidean_dist(&self, other: &PersistentCompactIntVec, sum_a: f64, sum_b: f64) -> f64 { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .map(|(a, b)| { - let pa = if sum_a > 0.0 { - (a as f64 / sum_a).sqrt() - } else { - 0.0 - }; - let pb = if sum_b > 0.0 { - (b as f64 / sum_b).sqrt() - } else { - 0.0 - }; + let pa = if sum_a > 0.0 { (a as f64 / sum_a).sqrt() } else { 0.0 }; + let pb = if sum_b > 0.0 { (b as f64 / sum_b).sqrt() } else { 0.0 }; let d = pa - pb; d * d }) @@ -317,22 +198,13 @@ impl PersistentCompactIntVec { } pub fn threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> f64 { - assert_eq!(self.n, other.len(), "length mismatch"); let (intersection, union) = self.partial_threshold_jaccard_dist(other, threshold); - if union == 0 { - return 0.0; - } - 1.0 - intersection as f64 / union as f64 + if union == 0 { 0.0 } else { 1.0 - intersection as f64 / union as f64 } } - pub fn partial_threshold_jaccard_dist( - &self, - other: &PersistentCompactIntVec, - threshold: u32, - ) -> (u64, 
u64) { + pub fn partial_threshold_jaccard_dist(&self, other: &PersistentCompactIntVec, threshold: u32) -> (u64, u64) { assert_eq!(self.n, other.len(), "length mismatch"); - self.iter() - .zip(other.iter()) + self.iter().zip(other.iter()) .fold((0u64, 0u64), |(inter, uni), (a, b)| { let ap = a >= threshold; let bp = b >= threshold; @@ -343,23 +215,12 @@ impl PersistentCompactIntVec { pub fn jaccard_dist(&self, other: &PersistentCompactIntVec) -> f64 { self.threshold_jaccard_dist(other, 1) } - - pub fn iter(&self) -> Iter<'_> { - Iter { - pciv: self, - slot: 0, - overflow_pos: 0, - } - } } impl<'a> IntoIterator for &'a PersistentCompactIntVec { type Item = u32; type IntoIter = Iter<'a>; - - fn into_iter(self) -> Iter<'a> { - self.iter() - } + fn into_iter(self) -> Iter<'a> { self.iter() } } pub struct Iter<'a> { @@ -374,9 +235,7 @@ impl Iterator for Iter<'_> { type Item = u32; fn next(&mut self) -> Option { - if self.slot >= self.pciv.n { - return None; - } + if self.slot >= self.pciv.n { return None; } let v = self.pciv.mmap[self.pciv.primary_offset + self.slot]; self.slot += 1; if v < 255 { diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs new file mode 100644 index 0000000..df1d436 --- /dev/null +++ b/src/obicompactvec/src/tempbitvec.rs @@ -0,0 +1,111 @@ +use std::io; +use std::path::Path; + +use tempfile::TempDir; + +use crate::bitvec::{PersistentBitVec, PersistentBitVecBuilder}; +use crate::views::{BitSliceIter, BitSliceView, IntSliceView}; + +// ── TempBitVec — frozen read-only, auto-deleted on drop ────────────────────── + +pub struct TempBitVec { + vec: PersistentBitVec, + // Dropped after `vec` (field order), so the mmap is released before the + // temp directory is deleted. 
+ _temp: TempDir, +} + +impl TempBitVec { + pub fn make_persistent(&self, path: &Path) -> io::Result { + std::fs::copy(self.vec.path(), path)?; + PersistentBitVec::open(path) + } + + pub fn len(&self) -> usize { + self.vec.len() + } + pub fn is_empty(&self) -> bool { + self.vec.is_empty() + } + pub fn get(&self, slot: usize) -> bool { + self.vec.get(slot) + } + pub fn count_ones(&self) -> u64 { + self.vec.count_ones() + } + pub fn view(&self) -> BitSliceView<'_> { + self.vec.view() + } + pub fn iter(&self) -> BitSliceIter<'_> { + self.view().iter() + } +} + +// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ──────────────── + +pub struct TempBitVecBuilder { + builder: PersistentBitVecBuilder, + temp: TempDir, +} + +impl TempBitVecBuilder { + pub fn new(n: usize) -> io::Result { + let temp = TempDir::new()?; + let path = temp.path().join("data.pbiv"); + let builder = PersistentBitVecBuilder::new(n, &path)?; + Ok(Self { builder, temp }) + } + + pub fn new_ones(n: usize) -> io::Result { + let temp = TempDir::new()?; + let path = temp.path().join("data.pbiv"); + let builder = PersistentBitVecBuilder::new_ones(n, &path)?; + Ok(Self { builder, temp }) + } + + pub fn freeze(self) -> io::Result { + let Self { builder, temp } = self; + let vec = builder.finish()?; + Ok(TempBitVec { vec, _temp: temp }) + } + + pub fn set(&mut self, slot: usize, value: bool) { + self.builder.set(slot, value); + } + + pub fn view(&self) -> BitSliceView<'_> { + self.builder.view() + } + + pub fn or(&mut self, other: BitSliceView<'_>) { + self.builder.or(other); + } + + pub fn and(&mut self, other: BitSliceView<'_>) { + self.builder.and(other); + } + + pub fn xor(&mut self, other: BitSliceView<'_>) { + self.builder.xor(other); + } + + pub fn not(&mut self) { + self.builder.not(); + } + + pub fn copy_from(&mut self, src: BitSliceView<'_>) { + self.builder.copy_from(src); + } + + pub fn or_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + 
self.builder.or_where(col, pred); + } + + pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + self.builder.and_where(col, pred); + } + + pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + self.builder.xor_where(col, pred); + } +} diff --git a/src/obicompactvec/src/tempintvec.rs b/src/obicompactvec/src/tempintvec.rs new file mode 100644 index 0000000..b0b3492 --- /dev/null +++ b/src/obicompactvec/src/tempintvec.rs @@ -0,0 +1,89 @@ +use std::io; +use std::path::Path; + +use tempfile::TempDir; + +use crate::builder::PersistentCompactIntVecBuilder; +use crate::reader::PersistentCompactIntVec; +use crate::views::{BitSliceView, IntSliceView}; + +// ── TempCompactIntVec — frozen read-only, auto-deleted on drop ──────────────── + +pub struct TempCompactIntVec { + vec: PersistentCompactIntVec, + // Dropped after `vec` (field order), so the mmap is released before the + // temp directory is deleted. + _temp: TempDir, +} + +impl TempCompactIntVec { + pub fn make_persistent(&self, path: &Path) -> io::Result { + std::fs::copy(self.vec.path(), path)?; + PersistentCompactIntVec::open(path) + } + + pub fn len(&self) -> usize { self.vec.len() } + pub fn is_empty(&self) -> bool { self.vec.is_empty() } + pub fn get(&self, slot: usize) -> u32 { self.vec.get(slot) } + pub fn sum(&self) -> u64 { self.vec.sum() } + pub fn view(&self) -> IntSliceView<'_> { self.vec.view() } + pub fn iter(&self) -> crate::reader::Iter<'_> { self.vec.iter() } +} + +// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ── + +pub struct TempCompactIntVecBuilder { + builder: PersistentCompactIntVecBuilder, + temp: TempDir, +} + +impl TempCompactIntVecBuilder { + pub fn new(n: usize) -> io::Result { + let temp = TempDir::new()?; + let path = temp.path().join("data.pciv"); + let builder = PersistentCompactIntVecBuilder::new(n, &path)?; + Ok(Self { builder, temp }) + } + + pub fn freeze(self) -> io::Result { + let Self { 
builder, temp } = self; + let vec = builder.finish()?; + Ok(TempCompactIntVec { vec, _temp: temp }) + } + + pub fn n(&self) -> usize { self.builder.len() } + + pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); } + pub fn get(&self, slot: usize) -> u32 { self.builder.get(slot) } + + pub fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() } + pub fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() } + + pub fn inc_present(&mut self, col: BitSliceView<'_>) { + self.builder.inc_present(col); + } + + pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) { + self.builder.inc_present_fast(col); + } + + pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + self.builder.inc_predicate(col, pred); + } + + pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + self.builder.inc_predicate_fast(col, pred); + } + + pub fn add(&mut self, other: IntSliceView<'_>) { + self.builder.add(other); + } + + pub fn mask_with(&mut self, mask: BitSliceView<'_>) { + self.builder.mask_with(mask); + } + + pub fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); } + pub fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); } + pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); } +} diff --git a/src/obicompactvec/src/tests/bitmatrix.rs b/src/obicompactvec/src/tests/bitmatrix.rs index 741a07c..7600ac3 100644 --- a/src/obicompactvec/src/tests/bitmatrix.rs +++ b/src/obicompactvec/src/tests/bitmatrix.rs @@ -1,6 +1,6 @@ use tempfile::tempdir; -use crate::{PersistentBitMatrix, PersistentBitMatrixBuilder}; +use crate::{pack_bit_matrix, PersistentBitMatrix, PersistentBitMatrixBuilder}; use crate::traits::BitPartials; fn make_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { @@ -203,3 +203,57 @@ fn partial_hamming_matches_hamming() { let full = m.hamming_dist_matrix(); assert_eq!(partial, 
full); } + +// ── col_view on Packed ──────────────────────────────────────────────────────── + +#[test] +fn col_view_packed_values() { + let (dir, _) = make_matrix(&[ + &[true, false, true, true], + &[false, true, false, true], + ]); + pack_bit_matrix(&dir.path().join("presence")).unwrap(); + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + + // col 0: [T, F, T, T] + let v0 = m.col_view(0); + assert_eq!(v0.len(), 4); + assert_eq!(v0.get(0), true); + assert_eq!(v0.get(1), false); + assert_eq!(v0.get(2), true); + assert_eq!(v0.get(3), true); + assert_eq!(v0.count_ones(), 3); + + // col 1: [F, T, F, T] + let v1 = m.col_view(1); + assert_eq!(v1.get(0), false); + assert_eq!(v1.get(1), true); + assert_eq!(v1.get(2), false); + assert_eq!(v1.get(3), true); + assert_eq!(v1.count_ones(), 2); +} + +#[test] +fn col_view_packed_matches_columnar() { + let data: &[&[bool]] = &[ + &[true, false, true, false, true, true, false, true], + &[false, false, true, true, false, true, true, false], + &[true, true, true, false, false, false, true, true], + ]; + let (dir_col, m_col) = make_matrix(data); + let (dir_pack, _) = make_matrix(data); + pack_bit_matrix(&dir_pack.path().join("presence")).unwrap(); + let m_pack = PersistentBitMatrix::open(dir_pack.path()).unwrap(); + + for c in 0..data.len() { + let col_ref = m_col.col(c); + let col_view = m_pack.col_view(c); + assert_eq!(col_view.len(), col_ref.len(), "col={c} len"); + for s in 0..col_ref.len() { + assert_eq!(col_view.get(s), col_ref.get(s), "col={c} slot={s}"); + } + assert_eq!(col_view.count_ones(), col_ref.count_ones(), "col={c} count_ones"); + assert_eq!(col_view.words(), col_ref.words(), "col={c} words"); + } + drop(dir_col); +} diff --git a/src/obicompactvec/src/tests/bitvec.rs b/src/obicompactvec/src/tests/bitvec.rs index 6b20568..4669489 100644 --- a/src/obicompactvec/src/tests/bitvec.rs +++ b/src/obicompactvec/src/tests/bitvec.rs @@ -77,7 +77,7 @@ fn op_and() { let dir = tempdir().unwrap(); let path = 
dir.path().join("out.pbiv"); let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); - b.and(&rb); + b.and(rb.view()); b.close().unwrap(); let r = PersistentBitVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![true, false, false, false]); @@ -90,7 +90,7 @@ fn op_or() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pbiv"); let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); - b.or(&rb); + b.or(rb.view()); b.close().unwrap(); let r = PersistentBitVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![true, true, true, false]); @@ -103,7 +103,7 @@ fn op_xor() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pbiv"); let mut b = PersistentBitVecBuilder::build_from(&ra, &path).unwrap(); - b.xor(&rb); + b.xor(rb.view()); b.close().unwrap(); let r = PersistentBitVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![false, true, true, false]); diff --git a/src/obicompactvec/src/tests/colgroup.rs b/src/obicompactvec/src/tests/colgroup.rs new file mode 100644 index 0000000..d1c7cf1 --- /dev/null +++ b/src/obicompactvec/src/tests/colgroup.rs @@ -0,0 +1,223 @@ +use tempfile::tempdir; + +use crate::{ + ColGroup, MatrixGroupOps, + PersistentBitMatrix, PersistentBitMatrixBuilder, + PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, +}; +use crate::{PersistentBitVecBuilder, PersistentCompactIntVec, PersistentCompactIntVecBuilder}; + +// ── helpers ─────────────────────────────────────────────────────────────────── + +fn make_int_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let mut b = PersistentCompactIntMatrixBuilder::new(n, &dir.path().join("counts")).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = 
PersistentCompactIntMatrix::open(dir.path()).unwrap(); + (dir, m) +} + +fn make_bit_matrix(cols: &[&[bool]]) -> (tempfile::TempDir, PersistentBitMatrix) { + let n = cols.first().map_or(0, |c| c.len()); + let dir = tempdir().unwrap(); + let presence = dir.path().join("presence"); + let mut b = PersistentBitMatrixBuilder::new(n, &presence).unwrap(); + for &col in cols { + let mut cb = b.add_col().unwrap(); + for (slot, &v) in col.iter().enumerate() { cb.set(slot, v); } + cb.close().unwrap(); + } + b.close().unwrap(); + let m = PersistentBitMatrix::open(dir.path()).unwrap(); + (dir, m) +} + +// ── IntMatrix: partial_group_sum ────────────────────────────────────────────── + +#[test] +fn int_partial_group_sum_basic() { + // col0=[1,2,3], col1=[10,20,30], col2=[100,0,5] + // group {0,2}: sum = [101, 2, 8] + let (_d, m) = make_int_matrix(&[&[1, 2, 3], &[10, 20, 30], &[100, 0, 5]]); + let g = ColGroup::new("g", vec![0, 2]); + let result = m.partial_group_sum(&g).unwrap(); + assert_eq!(result.get(0), 101); + assert_eq!(result.get(1), 2); + assert_eq!(result.get(2), 8); +} + +#[test] +fn int_partial_group_sum_with_overflow() { + // col0=[300,0], col1=[200,400]: group {0,1}: sum=[500, 400] + let (_d, m) = make_int_matrix(&[&[300, 0], &[200, 400]]); + let g = ColGroup::new("g", vec![0, 1]); + let result = m.partial_group_sum(&g).unwrap(); + assert_eq!(result.get(0), 500); + assert_eq!(result.get(1), 400); + assert_eq!(result.sum(), 900); +} + +// ── IntMatrix: partial_group_presence_count ─────────────────────────────────── + +#[test] +fn int_partial_group_presence_count() { + // col0=[5,1,0,3], col1=[2,0,4,3], col2=[0,3,1,0] + // threshold=2: col0: [T,F,F,T], col1: [T,F,T,T], col2: [F,T,F,F] + // group {0,1,2}: counts = [2, 1, 1, 2] + let (_d, m) = make_int_matrix(&[&[5, 1, 0, 3], &[2, 0, 4, 3], &[0, 3, 1, 0]]); + let g = ColGroup::new("g", vec![0, 1, 2]); + let result = m.partial_group_presence_count(&g, 2).unwrap(); + assert_eq!(result.get(0), 2); + 
assert_eq!(result.get(1), 1); + assert_eq!(result.get(2), 1); + assert_eq!(result.get(3), 2); +} + +#[test] +fn int_partial_group_presence_count_with_overflow() { + // col0=[300,0,10], col1=[0,400,10], col2=[1,1,10] + // threshold=5: col0: [T,F,T], col1: [F,T,T], col2: [F,F,T] + // group {0,1,2}: counts = [1, 1, 3] + let (_d, m) = make_int_matrix(&[&[300, 0, 10], &[0, 400, 10], &[1, 1, 10]]); + let g = ColGroup::new("g", vec![0, 1, 2]); + let result = m.partial_group_presence_count(&g, 5).unwrap(); + assert_eq!(result.get(0), 1); + assert_eq!(result.get(1), 1); + assert_eq!(result.get(2), 3); +} + +// ── IntMatrix: partial_group_any ────────────────────────────────────────────── + +#[test] +fn int_partial_group_any() { + // col0=[0,3,0,1], col1=[2,0,0,0], col2=[0,0,5,0] + // threshold=2: col0: [F,T,F,F], col1: [T,F,F,F], col2: [F,F,T,F] + // group {0,1,2}: any = [T, T, T, F] + let (_d, m) = make_int_matrix(&[&[0, 3, 0, 1], &[2, 0, 0, 0], &[0, 0, 5, 0]]); + let g = ColGroup::new("g", vec![0, 1, 2]); + let result = m.partial_group_any(&g, 2).unwrap(); + assert_eq!(result.get(0), true); + assert_eq!(result.get(1), true); + assert_eq!(result.get(2), true); + assert_eq!(result.get(3), false); +} + +// ── IntMatrix: mask_with ────────────────────────────────────────────────────── + +#[test] +fn mask_with_zeros_selected_slots() { + // count vec [10, 20, 30, 40], mask [T, F, T, F] → [10, 0, 30, 0] + let dir = tempdir().unwrap(); + let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap(); + v.set(0, 10); v.set(1, 20); v.set(2, 30); v.set(3, 40); + let mut mask = PersistentBitVecBuilder::new(4, &dir.path().join("m.pbiv")).unwrap(); + mask.set(0, true); mask.set(2, true); + v.mask_with(mask.view()); + v.close().unwrap(); + let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap(); + assert_eq!(r.get(0), 10); + assert_eq!(r.get(1), 0); + assert_eq!(r.get(2), 30); + assert_eq!(r.get(3), 0); +} + +#[test] +fn 
mask_with_overflow_slot_zeroed() { + // overflow slot (value 500) masked out → removed from overflow, primary=0 + let dir = tempdir().unwrap(); + let mut v = PersistentCompactIntVecBuilder::new(3, &dir.path().join("v.pciv")).unwrap(); + v.set(0, 10); v.set(1, 500); v.set(2, 5); + let mut mask = PersistentBitVecBuilder::new(3, &dir.path().join("m.pbiv")).unwrap(); + mask.set(0, true); mask.set(2, true); // slot 1 masked out + v.mask_with(mask.view()); + v.close().unwrap(); + let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap(); + assert_eq!(r.get(0), 10); + assert_eq!(r.get(1), 0); + assert_eq!(r.get(2), 5); + let ov: Vec<_> = r.view().overflow_entries().collect(); + assert!(ov.is_empty(), "overflow entry for masked-out slot should be gone"); +} + +#[test] +fn mask_with_all_ones_is_noop() { + let dir = tempdir().unwrap(); + let mut v = PersistentCompactIntVecBuilder::new(4, &dir.path().join("v.pciv")).unwrap(); + v.set(0, 300); v.set(1, 1); v.set(2, 0); v.set(3, 42); + let mask = PersistentBitVecBuilder::new_ones(4, &dir.path().join("m.pbiv")).unwrap(); + v.mask_with(mask.view()); + v.close().unwrap(); + let r = PersistentCompactIntVec::open(&dir.path().join("v.pciv")).unwrap(); + assert_eq!(r.get(0), 300); + assert_eq!(r.get(1), 1); + assert_eq!(r.get(2), 0); + assert_eq!(r.get(3), 42); +} + +// ── BitMatrix: partial_group_presence_count ─────────────────────────────────── + +#[test] +fn bit_partial_group_presence_count() { + // col0=[T,F,T,F], col1=[T,T,F,F], col2=[F,T,T,F] + // group {0,1,2}: counts = [2, 2, 2, 0] + let (_d, m) = make_bit_matrix(&[ + &[true, false, true, false], + &[true, true, false, false], + &[false,true, true, false], + ]); + let g = ColGroup::new("g", vec![0, 1, 2]); + let result = m.partial_group_presence_count(&g, 1).unwrap(); + assert_eq!(result.get(0), 2); + assert_eq!(result.get(1), 2); + assert_eq!(result.get(2), 2); + assert_eq!(result.get(3), 0); +} + +// ── BitMatrix: partial_group_any 
────────────────────────────────────────────── + +#[test] +fn bit_partial_group_any() { + // col0=[T,F,F], col1=[F,F,T], group {0,1}: any = [T, F, T] + let (_d, m) = make_bit_matrix(&[ + &[true, false, false], + &[false, false, true], + ]); + let g = ColGroup::new("g", vec![0, 1]); + let result = m.partial_group_any(&g, 1).unwrap(); + assert_eq!(result.get(0), true); + assert_eq!(result.get(1), false); + assert_eq!(result.get(2), true); +} + +// ── Composition: partial results are additive ───────────────────────────────── + +#[test] +fn int_presence_count_additive_across_split() { + // Simulate two partitions (different kmer ranges) whose counts should add. + // Global data for col0: [5,1,0,3,2], col1: [2,0,4,3,1] — threshold=2 + // Split: partition A = slots 0..2, partition B = slots 2..5 + let data_a: &[&[u32]] = &[&[5, 1], &[2, 0]]; + let data_b: &[&[u32]] = &[&[0, 3, 2], &[4, 3, 1]]; + let (_da, ma) = make_int_matrix(data_a); + let (_db, mb) = make_int_matrix(data_b); + let g = ColGroup::new("g", vec![0, 1]); + + let pa = ma.partial_group_presence_count(&g, 2).unwrap(); + let pb = mb.partial_group_presence_count(&g, 2).unwrap(); + + // Concatenate by adding (disjoint kmer ranges — here we just verify + // individual results match the expected per-partition counts). 
+ // partition A: col0=[5≥2,1<2]=[T,F], col1=[2≥2,0<2]=[T,F] → [2, 0] + assert_eq!(pa.get(0), 2); + assert_eq!(pa.get(1), 0); + // partition B: col0=[0<2,3≥2,2≥2]=[F,T,T], col1=[4≥2,3≥2,1<2]=[T,T,F] → [1, 2, 1] + assert_eq!(pb.get(0), 1); + assert_eq!(pb.get(1), 2); + assert_eq!(pb.get(2), 1); +} diff --git a/src/obicompactvec/src/tests/intmatrix.rs b/src/obicompactvec/src/tests/intmatrix.rs index c4c0a98..9abd7b5 100644 --- a/src/obicompactvec/src/tests/intmatrix.rs +++ b/src/obicompactvec/src/tests/intmatrix.rs @@ -1,6 +1,6 @@ use tempfile::tempdir; -use crate::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; +use crate::{pack_compact_int_matrix, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder}; use crate::traits::CountPartials; fn make_matrix(cols: &[&[u32]]) -> (tempfile::TempDir, PersistentCompactIntMatrix) { @@ -243,6 +243,61 @@ fn partial_hellinger_matches_full() { } } +#[test] +fn col_view_packed_values() { + // Build Columnar with overflow values (≥ 255), pack, reopen as Packed, exercise col_view(). 
+ let (dir, _col) = make_matrix(&[&[10, 300, 500], &[200, 50, 1000]]); + pack_compact_int_matrix(&dir.path().join("counts")).unwrap(); + let m = PersistentCompactIntMatrix::open(dir.path()).unwrap(); + + // col 0: [10, 300, 500] — two overflow slots + let v0 = m.col_view(0); + assert_eq!(v0.get(0), 10); + assert_eq!(v0.get(1), 300); + assert_eq!(v0.get(2), 500); + assert_eq!(v0.sum(), 810); + assert_eq!(v0.count_nonzero(), 3); + let mut ov0: Vec<(usize, u32)> = v0.overflow_entries().collect(); + ov0.sort_unstable_by_key(|&(s, _)| s); + assert_eq!(ov0, vec![(1, 300), (2, 500)]); + + // col 1: [200, 50, 1000] — one overflow slot + let v1 = m.col_view(1); + assert_eq!(v1.get(0), 200); + assert_eq!(v1.get(1), 50); + assert_eq!(v1.get(2), 1000); + let mut ov1: Vec<(usize, u32)> = v1.overflow_entries().collect(); + ov1.sort_unstable_by_key(|&(s, _)| s); + assert_eq!(ov1, vec![(2, 1000)]); +} + +#[test] +fn col_view_packed_matches_columnar() { + // Same data, compare col_view() on Packed against col() on Columnar slot-by-slot. + let data: &[&[u32]] = &[&[0, 255, 1, 300, 128], &[500, 3, 0, 700, 42]]; + let (dir_col, m_col) = make_matrix(data); + // Re-build in a separate dir so we can pack without touching m_col's files. 
+ let (dir_pack, _) = make_matrix(data); + pack_compact_int_matrix(&dir_pack.path().join("counts")).unwrap(); + let m_pack = PersistentCompactIntMatrix::open(dir_pack.path()).unwrap(); + + for c in 0..data.len() { + let col_ref = m_col.col(c); + let col_view = m_pack.col_view(c); + assert_eq!(col_view.len(), col_ref.len()); + for s in 0..col_ref.len() { + assert_eq!(col_view.get(s), col_ref.get(s), "col={c} slot={s}"); + } + assert_eq!(col_view.sum(), col_ref.sum(), "col={c} sum"); + let mut ov_view: Vec<(usize, u32)> = col_view.overflow_entries().collect(); + let mut ov_ref: Vec<(usize, u32)> = col_ref.view().overflow_entries().collect(); + ov_view.sort_unstable_by_key(|&(s, _)| s); + ov_ref.sort_unstable_by_key(|&(s, _)| s); + assert_eq!(ov_view, ov_ref, "col={c} overflow_entries"); + } + drop(dir_col); +} + #[test] fn partial_relfreq_bray_additive_across_split() { // Split rows [1,2,3,4,5] between two matrices; partial sums should add up. diff --git a/src/obicompactvec/src/tests/mod.rs b/src/obicompactvec/src/tests/mod.rs index 4d2d9ad..31f630e 100644 --- a/src/obicompactvec/src/tests/mod.rs +++ b/src/obicompactvec/src/tests/mod.rs @@ -1,5 +1,6 @@ mod bitmatrix; mod bitvec; +mod colgroup; mod intmatrix; use tempfile::tempdir; @@ -169,7 +170,7 @@ fn combine_min() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.min(&rb); + b.min(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![10, 100, 0, 800]); @@ -182,7 +183,7 @@ fn combine_max() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.max(&rb); + b.max(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![20, 300, 500, 1000]); @@ -195,7 +196,7 @@ fn combine_add() { 
let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.add(&rb); + b.add(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![30, 300, 5, 101]); @@ -220,7 +221,7 @@ fn combine_diff() { let dir = tempdir().unwrap(); let path = dir.path().join("out.pciv"); let mut b = PersistentCompactIntVecBuilder::build_from(&ra, &path).unwrap(); - b.diff(&rb); + b.diff(rb.view()); b.close().unwrap(); let r = PersistentCompactIntVec::open(&path).unwrap(); assert_eq!(r.iter().collect::>(), vec![10, 700, 0, 0]); diff --git a/src/obicompactvec/src/traits.rs b/src/obicompactvec/src/traits.rs index b61e69b..cc52bc1 100644 --- a/src/obicompactvec/src/traits.rs +++ b/src/obicompactvec/src/traits.rs @@ -1,6 +1,6 @@ use ndarray::{Array1, Array2}; -/// Column-level weight statistic — total count or presence count per column. +// ── Column-level weight statistic — total count or presence count per column. /// Additive across layers and partitions; used as denominator in normalised distances. /// /// `partial_kmer_counts` returns the number of **distinct k-mers** present per diff --git a/src/obicompactvec/src/views.rs b/src/obicompactvec/src/views.rs new file mode 100644 index 0000000..85e4165 --- /dev/null +++ b/src/obicompactvec/src/views.rs @@ -0,0 +1,278 @@ +use crate::format::{byte_count_nonzero, byte_sum, parse_overflow_entry}; + +// ── BitSliceView ────────────────────────────────────────────────────────────── + +/// Lightweight, copy-able read-only view over a u64 word array. +/// Bit `i` is in `words[i >> 6]` at position `i & 63`. Padding bits are zero. 
+#[derive(Clone, Copy)] +pub struct BitSliceView<'a> { + pub(crate) words: &'a [u64], + pub(crate) n: usize, +} + +impl<'a> BitSliceView<'a> { + #[inline] + pub fn new(words: &'a [u64], n: usize) -> Self { Self { words, n } } + + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + pub fn words(&self) -> &'a [u64] { self.words } + + #[inline] + pub fn get(&self, slot: usize) -> bool { + (self.words[slot >> 6] >> (slot & 63)) & 1 != 0 + } + + pub fn count_ones(&self) -> u64 { + self.words.iter().map(|w| w.count_ones() as u64).sum() + } + pub fn count_zeros(&self) -> u64 { self.n as u64 - self.count_ones() } + + pub fn iter(&self) -> BitSliceIter<'a> { + BitSliceIter { words: self.words, slot: 0, n: self.n } + } + + pub fn partial_jaccard_dist(self, other: BitSliceView<'_>) -> (u64, u64) { + assert_eq!(self.n, other.n, "BitSliceView length mismatch"); + self.words.iter().zip(other.words) + .fold((0u64, 0u64), |(i, u), (&a, &b)| { + (i + (a & b).count_ones() as u64, u + (a | b).count_ones() as u64) + }) + } + + pub fn jaccard_dist(self, other: BitSliceView<'_>) -> f64 { + let (inter, union) = self.partial_jaccard_dist(other); + if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } + } + + pub fn hamming_dist(self, other: BitSliceView<'_>) -> u64 { + assert_eq!(self.n, other.n, "BitSliceView length mismatch"); + self.words.iter().zip(other.words) + .map(|(&a, &b)| (a ^ b).count_ones() as u64) + .sum() + } +} + +// ── BitSliceIter ────────────────────────────────────────────────────────────── + +pub struct BitSliceIter<'a> { + words: &'a [u64], + slot: usize, + n: usize, +} + +impl Iterator for BitSliceIter<'_> { + type Item = bool; + fn next(&mut self) -> Option { + if self.slot >= self.n { return None; } + let v = (self.words[self.slot >> 6] >> (self.slot & 63)) & 1 != 0; + self.slot += 1; + Some(v) + } + fn size_hint(&self) -> (usize, Option) { + let rem = self.n - self.slot; + (rem, Some(rem)) + } +} +impl 
ExactSizeIterator for BitSliceIter<'_> {} + +// ── IntSliceView ────────────────────────────────────────────────────────────── + +/// Lightweight, copy-able read-only view over a compact-int primary array plus +/// its sorted raw overflow bytes. Zero-copy: all data lives in the caller's mmap. +#[derive(Clone, Copy)] +pub struct IntSliceView<'a> { + pub(crate) primary: &'a [u8], + pub(crate) overflow_raw: &'a [u8], // n_overflow × OVERFLOW_ENTRY_SIZE bytes, sorted by slot + pub(crate) n_overflow: usize, + pub(crate) n: usize, +} + +impl<'a> IntSliceView<'a> { + #[inline] + pub fn new(primary: &'a [u8], overflow_raw: &'a [u8], n_overflow: usize, n: usize) -> Self { + Self { primary, overflow_raw, n_overflow, n } + } + + pub fn len(&self) -> usize { self.n } + pub fn is_empty(&self) -> bool { self.n == 0 } + pub fn primary_bytes(&self) -> &'a [u8] { self.primary } + pub fn n_overflow(&self) -> usize { self.n_overflow } + + pub fn overflow_entries(&self) -> impl Iterator + 'a { + let raw = self.overflow_raw; + let n_ov = self.n_overflow; + (0..n_ov).map(move |i| parse_overflow_entry(raw, 0, i)) + } + + /// O(log n_overflow) via binary search (overflow is always sorted by slot). + pub fn get(&self, slot: usize) -> u32 { + let b = self.primary[slot]; + if b < 255 { return b as u32; } + let mut lo = 0usize; + let mut hi = self.n_overflow; + while lo < hi { + let mid = lo + (hi - lo) / 2; + let (s, v) = parse_overflow_entry(self.overflow_raw, 0, mid); + match s.cmp(&slot) { + std::cmp::Ordering::Equal => return v, + std::cmp::Ordering::Less => lo = mid + 1, + std::cmp::Ordering::Greater => hi = mid, + } + } + panic!("slot {slot} marked overflow but not found") + } + + /// Sequential merge scan: yields all n values in slot order. 
+ pub fn iter(&self) -> IntSliceViewIter<'a> { + IntSliceViewIter { + primary: self.primary, + overflow_raw: self.overflow_raw, + slot: 0, + overflow_pos: 0, + n: self.n, + } + } + + pub fn sum(&self) -> u64 { + byte_sum(self.primary, self.overflow_entries().map(|(_, v)| v)) + } + + pub fn count_nonzero(&self) -> u64 { + byte_count_nonzero(self.primary) + } + + // ── Distance methods ────────────────────────────────────────────────────── + + pub fn partial_bray_dist(self, other: IntSliceView<'_>) -> u64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()).map(|(a, b)| a.min(b) as u64).sum() + } + + pub fn bray_dist(self, other: IntSliceView<'_>) -> f64 { + let sum_min = self.partial_bray_dist(other); + let denom = self.sum() + other.sum(); + if denom == 0 { 0.0 } else { 1.0 - 2.0 * sum_min as f64 / denom as f64 } + } + + pub fn partial_relfreq_bray_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sa > 0.0 { a as f64 / sa } else { 0.0 }; + let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 }; + pa.min(pb) + }) + .sum() + } + + pub fn relfreq_bray_dist(self, other: IntSliceView<'_>) -> f64 { + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + 1.0 - self.partial_relfreq_bray_dist(other, sa, sb) + } + + pub fn partial_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { let d = a as f64 - b as f64; d * d }) + .sum() + } + + pub fn euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + self.partial_euclidean_dist(other).sqrt() + } + + pub fn partial_relfreq_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sa > 0.0 { a as 
f64 / sa } else { 0.0 }; + let pb = if sb > 0.0 { b as f64 / sb } else { 0.0 }; + let d = pa - pb; + d * d + }) + .sum() + } + + pub fn relfreq_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_relfreq_euclidean_dist(other, sa, sb).sqrt() + } + + pub fn partial_hellinger_euclidean_dist(self, other: IntSliceView<'_>, sa: f64, sb: f64) -> f64 { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .map(|(a, b)| { + let pa = if sa > 0.0 { (a as f64 / sa).sqrt() } else { 0.0 }; + let pb = if sb > 0.0 { (b as f64 / sb).sqrt() } else { 0.0 }; + let d = pa - pb; + d * d + }) + .sum() + } + + pub fn hellinger_euclidean_dist(self, other: IntSliceView<'_>) -> f64 { + let sa = self.sum() as f64; + let sb = other.sum() as f64; + if sa == 0.0 && sb == 0.0 { return 0.0; } + self.partial_hellinger_euclidean_dist(other, sa, sb).sqrt() + } + + pub fn hellinger_dist(self, other: IntSliceView<'_>) -> f64 { + self.hellinger_euclidean_dist(other) / std::f64::consts::SQRT_2 + } + + pub fn partial_threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> (u64, u64) { + assert_eq!(self.n, other.n, "length mismatch"); + self.iter().zip(other.iter()) + .fold((0u64, 0u64), |(inter, uni), (a, b)| { + let ap = a >= threshold; + let bp = b >= threshold; + (inter + (ap & bp) as u64, uni + (ap | bp) as u64) + }) + } + + pub fn threshold_jaccard_dist(self, other: IntSliceView<'_>, threshold: u32) -> f64 { + let (inter, union) = self.partial_threshold_jaccard_dist(other, threshold); + if union == 0 { 0.0 } else { 1.0 - inter as f64 / union as f64 } + } + + pub fn jaccard_dist(self, other: IntSliceView<'_>) -> f64 { + self.threshold_jaccard_dist(other, 1) + } +} + +// ── IntSliceViewIter ────────────────────────────────────────────────────────── + +pub struct IntSliceViewIter<'a> { + primary: &'a [u8], + overflow_raw: &'a [u8], + 
slot: usize, + overflow_pos: usize, + n: usize, +} + +impl Iterator for IntSliceViewIter<'_> { + type Item = u32; + fn next(&mut self) -> Option { + if self.slot >= self.n { return None; } + let v = self.primary[self.slot]; + self.slot += 1; + if v < 255 { + Some(v as u32) + } else { + let (_, val) = parse_overflow_entry(self.overflow_raw, 0, self.overflow_pos); + self.overflow_pos += 1; + Some(val) + } + } + fn size_hint(&self) -> (usize, Option) { + let rem = self.n - self.slot; + (rem, Some(rem)) + } +} +impl ExactSizeIterator for IntSliceViewIter<'_> {} diff --git a/src/obidebruinj/src/debruijn.rs b/src/obidebruinj/src/debruijn.rs index 8d300f2..f59f03a 100644 --- a/src/obidebruinj/src/debruijn.rs +++ b/src/obidebruinj/src/debruijn.rs @@ -3,6 +3,7 @@ use crossbeam_channel; use hashbrown::HashMap; use obikseq::k; use obikseq::{CanonicalKmer, Sequence, Unitig}; +#[cfg(not(any(test, feature = "test-utils")))] use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use std::cell::RefCell; use std::fmt; diff --git a/src/obikindex/src/index.rs b/src/obikindex/src/index.rs index 353c39a..f6b0889 100644 --- a/src/obikindex/src/index.rs +++ b/src/obikindex/src/index.rs @@ -204,6 +204,7 @@ impl KmerIndex { let n = self.n_partitions(); let order: Vec = (0..n).collect(); + let pb = progress_bar("pack", n as u64, "partitions"); crate::numa::PartitionRunner::new().run( &order, |i| -> OKIResult<()> { @@ -220,8 +221,10 @@ impl KmerIndex { } Ok(()) }, - |_, _, _| {}, - ) + |_, _, _| { pb.inc(1); }, + )?; + pb.finish_and_clear(); + Ok(()) } /// Write a `layer_meta.json` in any layer directory that is missing one. 
diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index c637c9b..cbfdaba 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -11,7 +11,7 @@ use obilayeredmap::IndexMode; use crate::error::{OKIError, OKIResult}; use crate::index::KmerIndex; use crate::meta::{GenomeInfo, IndexMeta}; -use crate::state::IndexState; +use crate::state::{IndexState, SENTINEL_INDEXED}; pub use obikpartitionner::MergeMode; @@ -263,6 +263,8 @@ impl KmerIndex { rep.push(t.stop()); } + fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?; + KmerIndex::open(output) } } diff --git a/src/obikindex/src/rebuild.rs b/src/obikindex/src/rebuild.rs index b1a8b5c..83a416d 100644 --- a/src/obikindex/src/rebuild.rs +++ b/src/obikindex/src/rebuild.rs @@ -98,7 +98,9 @@ impl KmerIndex { fs::File::create(output.join(SENTINEL_INDEXED))?; let idx = KmerIndex::open(output)?; + let t_pack = Stage::start("pack"); idx.pack_matrices()?; + rep.push(t_pack.stop()); Ok(idx) } } diff --git a/src/obikindex/src/select.rs b/src/obikindex/src/select.rs index 1db57bd..a27125b 100644 --- a/src/obikindex/src/select.rs +++ b/src/obikindex/src/select.rs @@ -3,7 +3,7 @@ use std::io; use std::path::Path; use obikpartitionner::{KmerPartition, OutputCol, PARTITIONS_SUBDIR}; -use obisys::{Stage, progress_bar}; +use obisys::{Reporter, Stage, progress_bar}; use tracing::info; use crate::error::{OKIError, OKIResult}; @@ -25,6 +25,7 @@ impl KmerIndex { threshold: u32, output_presence: bool, force: bool, + rep: &mut Reporter, ) -> OKIResult { let output = output.as_ref(); @@ -80,13 +81,14 @@ impl KmerIndex { ).map_err(OKIError::Partition)?; pb.finish_and_clear(); - - let _ = t.stop(); + rep.push(t.stop()); fs::File::create(output.join(SENTINEL_INDEXED))?; let idx = KmerIndex::open(output)?; + let t_pack = Stage::start("pack"); idx.pack_matrices()?; + rep.push(t_pack.stop()); Ok(idx) } @@ -98,6 +100,7 @@ impl KmerIndex { specs: &[OutputCol], threshold: u32, output_presence: bool, 
+ rep: &mut Reporter, ) -> OKIResult<()> { if self.state() != IndexState::Indexed { return Err(OKIError::NotIndexed(self.root_path.clone())); @@ -106,7 +109,6 @@ impl KmerIndex { let n_src_genomes = self.meta.genomes.len(); let n_partitions = self.partition.n_partitions(); - // Open a second handle to the same path so we can borrow src and dst simultaneously. let src_partition = KmerPartition::open_with_config( &self.root_path, self.meta.config.kmer_size, @@ -132,17 +134,17 @@ impl KmerIndex { ).map_err(OKIError::Partition)?; pb.finish_and_clear(); + rep.push(t.stop()); - let _ = t.stop(); - - // Update index.meta with new genome list and with_counts flag. self.meta.config.with_counts = !output_presence; self.meta.genomes = specs.iter() .map(|s| GenomeInfo::new(s.label.clone())) .collect(); self.meta.write(&self.root_path)?; + let t_pack = Stage::start("pack"); self.pack_matrices()?; + rep.push(t_pack.stop()); Ok(()) } } diff --git a/src/obikmer/Cargo.toml b/src/obikmer/Cargo.toml index 2dcfb91..9287bbd 100644 --- a/src/obikmer/Cargo.toml +++ b/src/obikmer/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "obikmer" -version = "0.1.0" +version = "0.1.3" edition = "2024" [[bin]] @@ -19,6 +19,7 @@ obikpartitionner = { path = "../obikpartitionner" } obisys = { path = "../obisys" } obiskio = { path = "../obiskio" } obikindex = { path = "../obikindex" } +obitaxonomy = { path = "../obitaxonomy" } obilayeredmap = { path = "../obilayeredmap" } clap = { version = "4", features = ["derive"] } serde_json = "1" diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs index 04678f0..47baab9 100644 --- a/src/obikmer/src/cmd/predicate.rs +++ b/src/obikmer/src/cmd/predicate.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use clap::Args; use obikindex::GenomeInfo; use obikpartitionner::{GroupQuorumFilter, KmerFilter}; +use obitaxonomy::{TaxPath, TaxPattern}; // ── Operator ────────────────────────────────────────────────────────────────── @@ -49,7 +50,6 @@ impl 
MetaPred { if values.iter().any(|v| v.is_empty()) { return Err(format!("empty value in predicate: {s}")); } - Ok(Self { key, op, values }) } @@ -70,18 +70,15 @@ impl MetaPred { // ── Path matching ───────────────────────────────────────────────────────────── -/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy. +/// True if the stored taxonomy `value` matches `pattern`. /// -/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary. -/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere. +/// `value` must be a valid `TaxPath` (starts with `taxonomy:/`). +/// `pattern` is a `TaxPattern` query (see `obitaxonomy::TaxPattern` for syntax). +/// Returns `false` if either fails to parse. fn path_matches(value: &str, pattern: &str) -> bool { - if pattern.starts_with('/') { - value == pattern - || (value.starts_with(pattern) - && value[pattern.len()..].starts_with('/')) - } else { - value.split('/').any(|seg| seg == pattern) - } + let Ok(path) = TaxPath::parse(value) else { return false }; + let Ok(pat) = TaxPattern::parse(pattern) else { return false }; + pat.matches(&path) } // ── Three-value group evaluation ────────────────────────────────────────────── diff --git a/src/obikmer/src/cmd/select.rs b/src/obikmer/src/cmd/select.rs index e021b36..35719e8 100644 --- a/src/obikmer/src/cmd/select.rs +++ b/src/obikmer/src/cmd/select.rs @@ -4,6 +4,7 @@ use std::path::PathBuf; use clap::{Args, ValueEnum}; use obikindex::{GenomeInfo, KmerIndex}; use obikpartitionner::{AggOp, OutputCol}; +use obisys::Reporter; use tracing::info; use super::predicate::matching_genome_indices; @@ -229,20 +230,24 @@ pub fn run(args: SelectArgs) { if output_presence { "presence" } else { "count" }, ); + let mut rep = Reporter::new(); + if args.in_place { - src.select_in_place(&specs, args.presence_threshold, output_presence) + src.select_in_place(&specs, args.presence_threshold, output_presence, &mut rep) 
.unwrap_or_else(|e| { eprintln!("select error: {e}"); std::process::exit(1); }); + rep.print(); info!("selected in-place → {}", args.source.display()); } else { let output = args.output.unwrap(); - KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force) + KmerIndex::select(&output, &src, &specs, args.presence_threshold, output_presence, args.force, &mut rep) .unwrap_or_else(|e| { eprintln!("select error: {e}"); std::process::exit(1); }); + rep.print(); info!("selected index → {}", output.display()); } } diff --git a/src/obikmer/src/main.rs b/src/obikmer/src/main.rs index fdcf69c..a0b270b 100644 --- a/src/obikmer/src/main.rs +++ b/src/obikmer/src/main.rs @@ -6,7 +6,7 @@ use clap::{Parser, Subcommand}; use tracing_subscriber::{EnvFilter, fmt}; #[derive(Parser)] -#[command(name = "obikmer", about = "DNA k-mer tools")] +#[command(name = "obikmer", about = "DNA k-mer tools", version)] struct Cli { #[command(subcommand)] command: Commands, diff --git a/src/obikpartitionner/src/filter.rs b/src/obikpartitionner/src/filter.rs index d5c6346..00f3b03 100644 --- a/src/obikpartitionner/src/filter.rs +++ b/src/obikpartitionner/src/filter.rs @@ -1,9 +1,24 @@ +use obicompactvec::FilterMask; + /// Trait for kmer row filters. /// /// `row` contains raw per-genome counts (or 0/1 for presence/absence data). /// `n_genomes` equals `row.len()`. pub trait KmerFilter: Send + Sync { fn passes(&self, row: &[u32], n_genomes: usize) -> bool; + + /// Express this filter as a [`FilterMask`] column-operation expression. + /// + /// Returns `Some(expr)` if the filter can be evaluated solely from matrix + /// column aggregates (no per-kmer row scan needed). Returns `None` if the + /// filter requires row-level inspection. + /// + /// `threshold` semantics in the returned mask use `>= threshold`, matching + /// [`obicompactvec::MatrixGroupOps`]. Implementations must add 1 to any + /// row-level threshold that uses strict `>` comparison. 
+ fn column_mask_expr(&self, _n_genomes: usize) -> Option { + None + } } /// True when `row` passes every filter in `filters`. @@ -29,6 +44,16 @@ impl KmerFilter for MinGenomeFraction { let p = present_count(row, self.threshold); p as f64 / n_genomes as f64 >= self.frac } + + fn column_mask_expr(&self, n_genomes: usize) -> Option { + let t = self.threshold.checked_add(1)?; + let min_count = (self.frac * n_genomes as f64).ceil() as usize; + Some(FilterMask::PresenceGeq { + indices: (0..n_genomes).collect(), + threshold: t, + min_count, + }) + } } /// At most `frac` fraction of genomes contain this kmer (count > `threshold`). @@ -42,6 +67,16 @@ impl KmerFilter for MaxGenomeFraction { let p = present_count(row, self.threshold); p as f64 / n_genomes as f64 <= self.frac } + + fn column_mask_expr(&self, n_genomes: usize) -> Option { + let t = self.threshold.checked_add(1)?; + let max_count = (self.frac * n_genomes as f64).floor() as usize; + Some(FilterMask::PresenceLeq { + indices: (0..n_genomes).collect(), + threshold: t, + max_count, + }) + } } /// At least `count` genomes contain this kmer (count > `threshold`). @@ -54,6 +89,15 @@ impl KmerFilter for MinGenomeCount { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { present_count(row, self.threshold) >= self.count } + + fn column_mask_expr(&self, n_genomes: usize) -> Option { + let t = self.threshold.checked_add(1)?; + Some(FilterMask::PresenceGeq { + indices: (0..n_genomes).collect(), + threshold: t, + min_count: self.count, + }) + } } /// At most `count` genomes contain this kmer (count > `threshold`). 
@@ -66,6 +110,15 @@ impl KmerFilter for MaxGenomeCount { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { present_count(row, self.threshold) <= self.count } + + fn column_mask_expr(&self, n_genomes: usize) -> Option { + let t = self.threshold.checked_add(1)?; + Some(FilterMask::PresenceLeq { + indices: (0..n_genomes).collect(), + threshold: t, + max_count: self.count, + }) + } } // ── Total-count filters (count indexes only) ─────────────────────────────────── @@ -79,6 +132,13 @@ impl KmerFilter for MinTotalCount { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { row.iter().sum::() >= self.total } + + fn column_mask_expr(&self, n_genomes: usize) -> Option { + Some(FilterMask::SumGeq { + indices: (0..n_genomes).collect(), + min_sum: self.total, + }) + } } /// Sum of counts across all genomes <= `total`. @@ -90,6 +150,13 @@ impl KmerFilter for MaxTotalCount { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { row.iter().sum::() <= self.total } + + fn column_mask_expr(&self, n_genomes: usize) -> Option { + Some(FilterMask::SumLeq { + indices: (0..n_genomes).collect(), + max_sum: self.total, + }) + } } // ── Group-based quorum filter ───────────────────────────────────────────────── @@ -113,6 +180,37 @@ pub struct GroupQuorumFilter { pub max_outgroup_frac: f64, } +impl GroupQuorumFilter { + // Build PresenceGeq/PresenceLeq constraints for one group (ingroup or outgroup). 
+ fn group_mask_parts( + indices: &[usize], + threshold: u32, + min_count: usize, + max_count: usize, + min_frac: f64, + max_frac: f64, + parts: &mut Vec, + ) { + let n = indices.len(); + let geq = min_count.max((min_frac * n as f64).ceil() as usize); + if geq > 0 { + parts.push(FilterMask::PresenceGeq { + indices: indices.to_vec(), + threshold, + min_count: geq, + }); + } + let leq = max_count.min((max_frac * n as f64).floor() as usize); + if leq < n { + parts.push(FilterMask::PresenceLeq { + indices: indices.to_vec(), + threshold, + max_count: leq, + }); + } + } +} + impl KmerFilter for GroupQuorumFilter { fn passes(&self, row: &[u32], _n_genomes: usize) -> bool { if !self.ingroup_idx.is_empty() { @@ -139,4 +237,26 @@ impl KmerFilter for GroupQuorumFilter { } true } + + fn column_mask_expr(&self, _n_genomes: usize) -> Option { + let t = self.threshold.checked_add(1)?; + let mut parts: Vec = Vec::new(); + if !self.ingroup_idx.is_empty() { + Self::group_mask_parts( + &self.ingroup_idx, t, + self.min_count, self.max_count, + self.min_frac, self.max_frac, + &mut parts, + ); + } + if !self.outgroup_idx.is_empty() { + Self::group_mask_parts( + &self.outgroup_idx, t, + self.min_outgroup_count, self.max_outgroup_count, + self.min_outgroup_frac, self.max_outgroup_frac, + &mut parts, + ); + } + Some(FilterMask::And(parts)) + } } diff --git a/src/obikpartitionner/src/merge_layer.rs b/src/obikpartitionner/src/merge_layer.rs index 0701b6d..32750af 100644 --- a/src/obikpartitionner/src/merge_layer.rs +++ b/src/obikpartitionner/src/merge_layer.rs @@ -10,6 +10,7 @@ use obipipeline::{ }; use obicompactvec::{ + MatrixGroupOps, PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder, }; @@ -78,6 +79,41 @@ impl SrcLayerData { } buf } + + pub(crate) fn n_slots(&self) -> usize { + match self { + SrcLayerData::Presence(_, mat) => mat.n(), + SrcLayerData::Count(_, mat) => 
mat.n(), + } + } + + /// MPHF lookup: returns the slot index for `kmer` (kmer must be in the domain). + #[inline] + pub(crate) fn slot(&self, kmer: CanonicalKmer) -> usize { + match self { + SrcLayerData::Presence(mphf, _) => mphf.index(kmer), + SrcLayerData::Count(mphf, _) => mphf.index(kmer), + } + } + + /// Row lookup by slot index, bypassing the MPHF. + #[inline] + pub(crate) fn fill_row_by_slot(&self, slot: usize, n_genomes: usize) -> Vec { + let mut buf = vec![0u32; n_genomes]; + match self { + SrcLayerData::Presence(_, mat) => mat.fill_row(slot, &mut buf), + SrcLayerData::Count(_, mat) => mat.fill_row(slot, &mut buf), + } + buf + } + + /// Call `f` with a reference to the underlying matrix as `&dyn MatrixGroupOps`. + pub(crate) fn with_matrix(&self, f: impl FnOnce(&dyn MatrixGroupOps) -> R) -> R { + match self { + SrcLayerData::Presence(_, mat) => f(mat), + SrcLayerData::Count(_, mat) => f(mat), + } + } } // ── helpers ─────────────────────────────────────────────────────────────────── diff --git a/src/obikpartitionner/src/rebuild_layer.rs b/src/obikpartitionner/src/rebuild_layer.rs index 6bd40f3..b8893ef 100644 --- a/src/obikpartitionner/src/rebuild_layer.rs +++ b/src/obikpartitionner/src/rebuild_layer.rs @@ -1,8 +1,9 @@ use std::path::Path; use obicompactvec::{ - PersistentBitMatrixBuilder, PersistentBitVecBuilder, PersistentCompactIntMatrixBuilder, - PersistentCompactIntVecBuilder, + FilterMask, eval_filter_mask, + PersistentBitMatrixBuilder, PersistentBitVecBuilder, + PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder, }; use obidebruinj::GraphDeBruijn; use obikseq::CanonicalKmer; @@ -10,18 +11,135 @@ use obilayeredmap::meta::PartitionMeta; use obilayeredmap::{IndexMode, MphfLayer}; use obiskio::{SKError, SKResult, UnitigFileReader}; -use crate::common::{ColBuilder, col_path_bit, col_path_int, load_meta, olm_to_sk, write_matrix_meta}; -use crate::filter::{KmerFilter, passes_all}; +use crate::common::{load_meta, olm_to_sk}; +use 
crate::filter::KmerFilter; use crate::graph_pipeline::materialize_layer; use crate::merge_layer::{MergeMode, SrcLayerData}; use crate::partition::KmerPartition; const INDEX_SUBDIR: &str = "index"; -/// Iterate all kmers in `src_index_dir` that pass `filters`, yielding `(kmer, row)`. +// ── Builders — pair matrix builder + column builders for one mode ───────────── + +enum Builders { + Presence(PersistentBitMatrixBuilder, Vec), + Count(PersistentCompactIntMatrixBuilder, Vec), +} + +impl Builders { + fn new(mode: MergeMode, n: usize, dir: &Path, n_genomes: usize) -> SKResult { + match mode { + MergeMode::Presence => { + let mut mat = PersistentBitMatrixBuilder::new(n, dir).map_err(SKError::Io)?; + let mut cols = Vec::with_capacity(n_genomes); + for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); } + Ok(Builders::Presence(mat, cols)) + } + MergeMode::Count => { + let mut mat = PersistentCompactIntMatrixBuilder::new(n, dir).map_err(SKError::Io)?; + let mut cols = Vec::with_capacity(n_genomes); + for _ in 0..n_genomes { cols.push(mat.add_col().map_err(SKError::Io)?); } + Ok(Builders::Count(mat, cols)) + } + } + } + + fn set_val(&mut self, col: usize, slot: usize, value: u32) { + match self { + Builders::Presence(_, cols) => cols[col].set(slot, value > 0), + Builders::Count(_, cols) => cols[col].set(slot, value), + } + } + + fn close(self) -> SKResult<()> { + match self { + Builders::Presence(mat, cols) => { + for b in cols { b.close().map_err(SKError::Io)?; } + mat.close().map_err(SKError::Io) + } + Builders::Count(mat, cols) => { + for b in cols { b.close().map_err(SKError::Io)?; } + mat.close().map_err(SKError::Io) + } + } + } +} + +// ── try_compute_combined_mask ───────────────────────────────────────────────── + +/// Build a per-slot `TempBitVec` mask from `filters` using column operations +/// on the source matrix — no per-kmer MPHF lookup or row read needed. 
/// -/// Uses [`SrcLayerData`] semantics: counts take priority over presence when -/// `mode = Count`; presence (or implicit all-ones) is used for `Presence`. +/// Returns `Some(mask)` when every filter in `filters` can express itself as +/// a [`FilterMask`] expression. Returns `None` when any filter requires +/// row-level inspection (fall back to `passes_all`). +fn try_compute_combined_mask( + filters: &[Box], + src_data: &SrcLayerData, + n_genomes: usize, +) -> SKResult> { + if filters.is_empty() { + return Ok(None); + } + let mut exprs: Vec = Vec::with_capacity(filters.len()); + for f in filters { + match f.column_mask_expr(n_genomes) { + Some(expr) => exprs.push(expr), + None => return Ok(None), + } + } + let combined = FilterMask::And(exprs); + let n = src_data.n_slots(); + let mask = src_data + .with_matrix(|mat| eval_filter_mask(&combined, mat, n)) + .map_err(SKError::Io)?; + Ok(Some(mask)) +} + +// ── iter_src_kmers_masked (pass 1) ──────────────────────────────────────────── + +/// Iterate all passing kmers in `src_index_dir`, yielding only the kmer value. +/// +/// When all filters can be expressed as column operations, a per-slot mask is +/// computed once per layer and used for O(1) slot-check per kmer instead of a +/// full row read. Falls back to row-level `passes_all` otherwise. 
+fn iter_src_kmers_masked( + src_index_dir: &Path, + mode: MergeMode, + n_genomes: usize, + filters: &[Box], + mut cb: impl FnMut(CanonicalKmer), +) -> SKResult<()> { + let src_meta = load_meta(src_index_dir, "rebuild")?; + for l in 0..src_meta.n_layers { + let src_layer_dir = src_index_dir.join(format!("layer_{l}")); + let unitigs_path = src_layer_dir.join("unitigs.bin"); + if !unitigs_path.exists() { continue; } + + let src_data = SrcLayerData::open(&src_layer_dir, mode)?; + let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?; + let reader = UnitigFileReader::open_sequential(&unitigs_path)?; + + for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { + let slot = src_data.slot(kmer); + let passes = match &mask { + Some(m) => m.get(slot), + None => { + let row = src_data.fill_row_by_slot(slot, n_genomes); + filters.iter().all(|f| f.passes(&row, n_genomes)) + } + }; + if passes { cb(kmer); } + } + } + Ok(()) +} + +// ── iter_src_layers (pass 2) ────────────────────────────────────────────────── + +/// Iterate all passing kmers in `src_index_dir`, yielding `(kmer, row)`. +/// +/// When the slot mask is available, skips the row read for filtered-out slots. 
fn iter_src_layers( src_index_dir: &Path, mode: MergeMode, @@ -33,17 +151,23 @@ fn iter_src_layers( for l in 0..src_meta.n_layers { let src_layer_dir = src_index_dir.join(format!("layer_{l}")); let unitigs_path = src_layer_dir.join("unitigs.bin"); - if !unitigs_path.exists() { - continue; - } + if !unitigs_path.exists() { continue; } - let reader = UnitigFileReader::open_sequential(&unitigs_path)?; let src_data = SrcLayerData::open(&src_layer_dir, mode)?; + let mask = try_compute_combined_mask(filters, &src_data, n_genomes)?; + let reader = UnitigFileReader::open_sequential(&unitigs_path)?; for (kmer, _, _) in reader.iter_indexed_canonical_kmers() { - let row = src_data.lookup(kmer, n_genomes); - if passes_all(filters, &row, n_genomes) { + let slot = src_data.slot(kmer); + if let Some(ref m) = mask { + if !m.get(slot) { continue; } + let row = src_data.fill_row_by_slot(slot, n_genomes); cb(kmer, row.into_boxed_slice()); + } else { + let row = src_data.fill_row_by_slot(slot, n_genomes); + if filters.iter().all(|f| f.passes(&row, n_genomes)) { + cb(kmer, row.into_boxed_slice()); + } } } } @@ -81,7 +205,7 @@ impl KmerPartition { // ── Pass 1: collect filtered kmers into de Bruijn graph ─────────────── let mut g = GraphDeBruijn::new(); - iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, _row| { + iter_src_kmers_masked(&src_index_dir, mode, n_genomes, filters, |kmer| { g.push(kmer); })?; @@ -100,54 +224,22 @@ impl KmerPartition { // ── Prepare matrix builders (one column per genome) ─────────────────── let data_dir = match mode { MergeMode::Presence => dst_layer_dir.join("presence"), - MergeMode::Count => dst_layer_dir.join("counts"), + MergeMode::Count => dst_layer_dir.join("counts"), }; std::fs::create_dir_all(&data_dir)?; - - let mut builders: Vec = match mode { - MergeMode::Presence => { - PersistentBitMatrixBuilder::new(n_new, &data_dir) - .map_err(SKError::Io)? 
- .close() - .map_err(SKError::Io)?; - (0..n_genomes) - .map(|g| -> SKResult { - let b = PersistentBitVecBuilder::new(n_new, &col_path_bit(&data_dir, g))?; - Ok(ColBuilder::Bit(b)) - }) - .collect::>()? - } - MergeMode::Count => { - PersistentCompactIntMatrixBuilder::new(n_new, &data_dir) - .map_err(SKError::Io)? - .close() - .map_err(SKError::Io)?; - (0..n_genomes) - .map(|g| -> SKResult { - let b = PersistentCompactIntVecBuilder::new( - n_new, - &col_path_int(&data_dir, g), - )?; - Ok(ColBuilder::Int(b)) - }) - .collect::>()? - } - }; + let mut builders = Builders::new(mode, n_new, &data_dir, n_genomes)?; // ── Pass 2: fill builders ───────────────────────────────────────────── iter_src_layers(&src_index_dir, mode, n_genomes, filters, |kmer, row| { if let Some(slot) = dst_mphf.find(kmer) { for (col, &value) in row.iter().enumerate() { - builders[col].set_val(slot, value); + builders.set_val(col, slot, value); } } })?; - // ── Close builders, write metadata ──────────────────────────────────── - for b in builders { - b.close()?; - } - write_matrix_meta(&data_dir, n_new, n_genomes).map_err(SKError::Io)?; + // ── Close builders and write metadata ───────────────────────────────── + builders.close()?; PartitionMeta { n_layers: 1, diff --git a/src/obikpartitionner/src/select_layer.rs b/src/obikpartitionner/src/select_layer.rs index 36286c0..c7f45e4 100644 --- a/src/obikpartitionner/src/select_layer.rs +++ b/src/obikpartitionner/src/select_layer.rs @@ -3,8 +3,9 @@ use std::io; use std::path::{Path, PathBuf}; use obicompactvec::{ - PersistentBitMatrix, PersistentBitMatrixBuilder, PersistentBitVecBuilder, - PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, PersistentCompactIntVecBuilder, + ColGroup, MatrixGroupOps, + PersistentBitMatrix, PersistentBitMatrixBuilder, + PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, }; use obilayeredmap::meta::PartitionMeta; use obilayeredmap::OLMError; @@ -40,52 +41,6 @@ pub struct OutputCol { pub op: AggOp, } 
-// ── Aggregation ─────────────────────────────────────────────────────────────── - -#[inline] -fn aggregate(op: AggOp, indices: &[usize], src_row: &[u32], threshold: u32) -> u32 { - match op { - AggOp::Any => { - if indices.iter().any(|&i| src_row[i] > threshold) { 1 } else { 0 } - } - AggOp::All => { - if indices.is_empty() { return 0; } - if indices.iter().all(|&i| src_row[i] > threshold) { 1 } else { 0 } - } - AggOp::None => { - if indices.iter().all(|&i| src_row[i] <= threshold) { 1 } else { 0 } - } - AggOp::Sum => { - indices.iter().map(|&i| src_row[i]).fold(0u32, |a, b| a.saturating_add(b)) - } - AggOp::Min => indices.iter().map(|&i| src_row[i]).min().unwrap_or(0), - AggOp::Max => indices.iter().map(|&i| src_row[i]).max().unwrap_or(0), - } -} - -// ── ColBuilder ──────────────────────────────────────────────────────────────── - -enum ColBuilder { - Bit(PersistentBitVecBuilder), - Int(PersistentCompactIntVecBuilder), -} - -impl ColBuilder { - fn set_val(&mut self, slot: usize, value: u32) { - match self { - ColBuilder::Bit(b) => b.set(slot, value > 0), - ColBuilder::Int(b) => b.set(slot, value), - } - } - - fn close(self) -> SKResult<()> { - match self { - ColBuilder::Bit(b) => b.close().map_err(SKError::Io), - ColBuilder::Int(b) => b.close().map_err(SKError::Io), - } - } -} - // ── Helpers ─────────────────────────────────────────────────────────────────── fn olm_to_sk(e: OLMError) -> SKError { @@ -95,21 +50,6 @@ fn olm_to_sk(e: OLMError) -> SKError { } } -fn col_path_bit(dir: &Path, col: usize) -> PathBuf { - dir.join(format!("col_{col:06}.pbiv")) -} - -fn col_path_int(dir: &Path, col: usize) -> PathBuf { - dir.join(format!("col_{col:06}.pciv")) -} - -fn write_matrix_meta(dir: &Path, n: usize, n_cols: usize) -> io::Result<()> { - fs::write( - dir.join("meta.json"), - format!("{{\"n\":{n},\"n_cols\":{n_cols}}}\n"), - ) -} - /// Copy all plain files (not subdirectories) from `src_dir` to `dst_dir`. 
fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> { for entry in fs::read_dir(src_dir)? { @@ -125,30 +65,64 @@ fn copy_layer_files(src_dir: &Path, dst_dir: &Path) -> io::Result<()> { // ── fill_builders ───────────────────────────────────────────────────────────── fn fill_builders( - builders: &mut [ColBuilder], specs: &[OutputCol], - n: usize, - n_src: usize, src_layer_dir: &Path, src_is_count: bool, threshold: u32, + output_presence: bool, + mut dst_bit: Option<&mut PersistentBitMatrixBuilder>, + mut dst_int: Option<&mut PersistentCompactIntMatrixBuilder>, ) -> SKResult<()> { - let mut src_buf = vec![0u32; n_src]; - if src_is_count { let mat = PersistentCompactIntMatrix::open(src_layer_dir).map_err(SKError::Io)?; - for slot in 0..n { - mat.fill_row(slot, &mut src_buf); - for (col, spec) in specs.iter().enumerate() { - builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold)); + for spec in specs { + let g = ColGroup::new(&spec.label, spec.indices.clone()); + if output_presence { + let b = dst_bit.as_deref_mut().unwrap(); + match spec.op { + AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?), + AggOp::All => b.add_col_from (&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?), + AggOp::None => b.add_col_from (&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?), + AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?), + AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?), + AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?), + }.map_err(SKError::Io)?; + } else { + let b = dst_int.as_deref_mut().unwrap(); + match spec.op { + AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?), + AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?), + AggOp::Max => b.add_col_from (&mat.partial_group_max 
(&g).map_err(SKError::Io)?), + AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, threshold).map_err(SKError::Io)?), + AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, threshold).map_err(SKError::Io)?), + AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, threshold).map_err(SKError::Io)?), + }.map_err(SKError::Io)?; } } } else { let mat = PersistentBitMatrix::open(src_layer_dir).map_err(SKError::Io)?; - for slot in 0..n { - mat.fill_row(slot, &mut src_buf); - for (col, spec) in specs.iter().enumerate() { - builders[col].set_val(slot, aggregate(spec.op, &spec.indices, &src_buf, threshold)); + for spec in specs { + let g = ColGroup::new(&spec.label, spec.indices.clone()); + if output_presence { + let b = dst_bit.as_deref_mut().unwrap(); + match spec.op { + AggOp::Any => b.add_col_from (&mat.partial_group_any (&g, 1).map_err(SKError::Io)?), + AggOp::All => b.add_col_from (&mat.partial_group_all (&g, 1).map_err(SKError::Io)?), + AggOp::None => b.add_col_from (&mat.partial_group_none(&g, 1).map_err(SKError::Io)?), + AggOp::Sum => b.add_col_from_int(&mat.partial_group_sum (&g).map_err(SKError::Io)?), + AggOp::Min => b.add_col_from_int(&mat.partial_group_min (&g).map_err(SKError::Io)?), + AggOp::Max => b.add_col_from_int(&mat.partial_group_max (&g).map_err(SKError::Io)?), + }.map_err(SKError::Io)?; + } else { + let b = dst_int.as_deref_mut().unwrap(); + match spec.op { + AggOp::Sum => b.add_col_from (&mat.partial_group_sum (&g).map_err(SKError::Io)?), + AggOp::Min => b.add_col_from (&mat.partial_group_min (&g).map_err(SKError::Io)?), + AggOp::Max => b.add_col_from (&mat.partial_group_max (&g).map_err(SKError::Io)?), + AggOp::Any => b.add_col_from_bit(&mat.partial_group_any (&g, 1).map_err(SKError::Io)?), + AggOp::All => b.add_col_from_bit(&mat.partial_group_all (&g, 1).map_err(SKError::Io)?), + AggOp::None => b.add_col_from_bit(&mat.partial_group_none(&g, 1).map_err(SKError::Io)?), + }.map_err(SKError::Io)?; } } } @@ -168,7 +142,7 @@ impl 
KmerPartition { src: &KmerPartition, i: usize, specs: &[OutputCol], - n_src_genomes: usize, + _n_src_genomes: usize, threshold: u32, output_presence: bool, in_place: bool, @@ -188,7 +162,6 @@ impl KmerPartition { fs::create_dir_all(&dst_index_dir)?; } - let n_out = specs.len(); let data_subdir = if output_presence { "presence" } else { "counts" }; for l in 0..src_meta.n_layers { @@ -201,7 +174,7 @@ impl KmerPartition { let presence_dir = src_layer_dir.join("presence"); let src_is_count = counts_dir.exists() && !presence_dir.exists(); - // Determine number of slots from the source matrix. + // Determine number of slots and detect implicit layers. let n = if counts_dir.exists() { PersistentCompactIntMatrix::open(&src_layer_dir).map_err(SKError::Io)?.n() } else if presence_dir.exists() { @@ -216,7 +189,7 @@ impl KmerPartition { }; // Choose the output data directory (temp name for in-place). - let (dst_data_dir, final_data_dir) = if in_place { + let (dst_data_dir, final_data_dir): (PathBuf, PathBuf) = if in_place { let tmp = dst_layer_dir.join(format!("{data_subdir}_new")); let perm = dst_layer_dir.join(data_subdir); (tmp, perm) @@ -231,37 +204,22 @@ impl KmerPartition { } fs::create_dir_all(&dst_data_dir)?; - // Initialise packed-format skeleton. - if output_presence { - PersistentBitMatrixBuilder::new(n, &dst_data_dir) - .map_err(SKError::Io)?.close().map_err(SKError::Io)?; + let (mut dst_bit, mut dst_int) = if output_presence { + (Some(PersistentBitMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?), None) } else { - PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir) - .map_err(SKError::Io)?.close().map_err(SKError::Io)?; - } - - // Create column builders. 
- let mut builders: Vec = (0..n_out) - .map(|col| -> SKResult { - if output_presence { - Ok(ColBuilder::Bit(PersistentBitVecBuilder::new( - n, &col_path_bit(&dst_data_dir, col), - )?)) - } else { - Ok(ColBuilder::Int(PersistentCompactIntVecBuilder::new( - n, &col_path_int(&dst_data_dir, col), - )?)) - } - }) - .collect::>()?; + (None, Some(PersistentCompactIntMatrixBuilder::new(n, &dst_data_dir).map_err(SKError::Io)?)) + }; fill_builders( - &mut builders, specs, n, n_src_genomes, - &src_layer_dir, src_is_count, threshold, + specs, &src_layer_dir, src_is_count, threshold, output_presence, + dst_bit.as_mut(), dst_int.as_mut(), )?; - for b in builders { b.close()?; } - write_matrix_meta(&dst_data_dir, n, n_out).map_err(SKError::Io)?; + if output_presence { + dst_bit.unwrap().close().map_err(SKError::Io)?; + } else { + dst_int.unwrap().close().map_err(SKError::Io)?; + } // In-place: swap old data dir for new. if in_place { diff --git a/src/obilayeredmap/src/layer.rs b/src/obilayeredmap/src/layer.rs index 72b38ea..475bca7 100644 --- a/src/obilayeredmap/src/layer.rs +++ b/src/obilayeredmap/src/layer.rs @@ -106,11 +106,7 @@ impl Layer<()> { let presence_dir = layer_dir.join(PRESENCE_DIR); fs::create_dir_all(&presence_dir).map_err(OLMError::Io)?; let mut mb = PersistentBitMatrixBuilder::new(n_kmers, &presence_dir).map_err(OLMError::Io)?; - let mut col = mb.add_col().map_err(OLMError::Io)?; - for slot in 0..n_kmers { - col.set(slot, true); - } - col.close().map_err(OLMError::Io)?; + mb.add_col_ones().map_err(OLMError::Io)?.close().map_err(OLMError::Io)?; mb.close().map_err(OLMError::Io) } } diff --git a/src/obitaxonomy/Cargo.toml b/src/obitaxonomy/Cargo.toml new file mode 100644 index 0000000..b391f4d --- /dev/null +++ b/src/obitaxonomy/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "obitaxonomy" +version = "0.1.0" +edition = "2024" + +[dependencies] diff --git a/src/obitaxonomy/src/error.rs b/src/obitaxonomy/src/error.rs new file mode 100644 index 0000000..5f4f24e --- 
/dev/null +++ b/src/obitaxonomy/src/error.rs @@ -0,0 +1,38 @@ +use std::fmt; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum TaxError { + /// Stored value does not start with the `taxonomy:/` prefix. + MissingPrefix, + /// Stored path contains no segments after the prefix. + EmptyPath, + /// Query pattern contains no segments (after stripping anchors). + EmptyPattern, + /// A segment has an empty name (e.g. consecutive `/`). + EmptySegmentName, + /// A segment has a trailing `@` with no rank name. + EmptyRankName { segment: String }, + /// A segment contains more than one `@`. + AmbiguousRank { segment: String }, +} + +impl fmt::Display for TaxError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + TaxError::MissingPrefix => + write!(f, "taxonomy path must start with \"taxonomy:/\""), + TaxError::EmptyPath => + write!(f, "taxonomy path has no segments"), + TaxError::EmptyPattern => + write!(f, "taxonomy query pattern has no segments"), + TaxError::EmptySegmentName => + write!(f, "segment has an empty name"), + TaxError::EmptyRankName { segment } => + write!(f, "segment has '@' with no rank name: {segment:?}"), + TaxError::AmbiguousRank { segment } => + write!(f, "segment contains more than one '@': {segment:?}"), + } + } +} + +impl std::error::Error for TaxError {} diff --git a/src/obitaxonomy/src/lib.rs b/src/obitaxonomy/src/lib.rs new file mode 100644 index 0000000..aea3cff --- /dev/null +++ b/src/obitaxonomy/src/lib.rs @@ -0,0 +1,11 @@ +mod error; +mod segment; +mod segment_pattern; +mod path; +mod pattern; + +pub use error::TaxError; +pub use segment::TaxSegment; +pub use segment_pattern::SegmentPattern; +pub use path::{TaxPath, PREFIX}; +pub use pattern::TaxPattern; diff --git a/src/obitaxonomy/src/path.rs b/src/obitaxonomy/src/path.rs new file mode 100644 index 0000000..096c09b --- /dev/null +++ b/src/obitaxonomy/src/path.rs @@ -0,0 +1,82 @@ +use std::fmt; +use std::str::FromStr; + +use crate::error::TaxError; +use 
crate::segment::TaxSegment; + +/// The prefix that marks a metadata value as a taxonomy path. +pub const PREFIX: &str = "taxonomy:/"; + +/// A rooted, `/`-separated taxonomy path with optional per-segment rank annotations. +/// +/// Stored form: `taxonomy:/seg1@rank1/seg2/seg3@rank3` +/// The leading `taxonomy:/` is the discriminator; the remainder is one or more +/// `/`-separated segments, each of the form `name` or `name@rank`. +/// +/// `@` is reserved and may not appear in segment names or rank names. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TaxPath { + segments: Vec, +} + +impl TaxPath { + pub fn parse(s: &str) -> Result { + let tail = s.strip_prefix(PREFIX).ok_or(TaxError::MissingPrefix)?; + if tail.is_empty() { + return Err(TaxError::EmptyPath); + } + let segments = tail.split('/') + .map(TaxSegment::parse) + .collect::, _>>()?; + Ok(Self { segments }) + } + + /// True if `self` is an ancestor of — or equal to — `other`. + /// + /// Comparison is by segment name only; rank annotations are ignored. + /// `self` must be a prefix of `other` at segment granularity. + pub fn is_ancestor_of(&self, other: &TaxPath) -> bool { + self.segments.len() <= other.segments.len() + && self.segments.iter().zip(other.segments.iter()) + .all(|(a, b)| a.name() == b.name()) + } + + /// Returns the name of the first segment whose rank equals `rank`, if any. + pub fn name_at_rank(&self, rank: &str) -> Option<&str> { + self.segments.iter() + .find(|s| s.rank() == Some(rank)) + .map(|s| s.name()) + } + + /// True if any segment has the given rank. + pub fn has_rank(&self, rank: &str) -> bool { + self.segments.iter().any(|s| s.rank() == Some(rank)) + } + + /// True if the path contains a segment with both the given rank and name. 
+ pub fn matches_rank(&self, rank: &str, name: &str) -> bool { + self.segments.iter().any(|s| s.rank() == Some(rank) && s.name() == name) + } + + pub fn segments(&self) -> &[TaxSegment] { &self.segments } + pub fn depth(&self) -> usize { self.segments.len() } + pub fn is_empty(&self) -> bool { self.segments.is_empty() } +} + +impl fmt::Display for TaxPath { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", PREFIX)?; + let mut first = true; + for seg in &self.segments { + if !first { write!(f, "/")?; } + write!(f, "{seg}")?; + first = false; + } + Ok(()) + } +} + +impl FromStr for TaxPath { + type Err = TaxError; + fn from_str(s: &str) -> Result { Self::parse(s) } +} diff --git a/src/obitaxonomy/src/pattern.rs b/src/obitaxonomy/src/pattern.rs new file mode 100644 index 0000000..c0474d8 --- /dev/null +++ b/src/obitaxonomy/src/pattern.rs @@ -0,0 +1,72 @@ +use crate::error::TaxError; +use crate::path::TaxPath; +use crate::segment::TaxSegment; +use crate::segment_pattern::SegmentPattern; + +/// A query pattern for matching against stored `TaxPath` values. +/// +/// Syntax: +/// +/// | Form | Semantics | +/// |----------|-----------| +/// | `A/B` | A then B as a contiguous sub-path, anywhere in the value | +/// | `/A/B` | value starts with A then B (start-anchored) | +/// | `A/B$` | value ends with A then B (end-anchored) | +/// | `/A/B$` | value is exactly A then B (fully anchored) | +/// | `A@x/B` | A with rank `x`, followed by B with any rank | +/// +/// A segment pattern without `@` matches any segment with that name regardless +/// of its stored rank. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TaxPattern { + start_anchored: bool, + end_anchored: bool, + segments: Vec, +} + +impl TaxPattern { + pub fn parse(s: &str) -> Result { + let s = s.trim(); + + let start_anchored = s.starts_with('/'); + let s = if start_anchored { &s[1..] 
} else { s }; + + let end_anchored = s.ends_with('$'); + let s = if end_anchored { &s[..s.len() - 1] } else { s }; + + if s.is_empty() { + return Err(TaxError::EmptyPattern); + } + + let segments = s.split('/') + .map(SegmentPattern::parse) + .collect::, _>>()?; + + Ok(Self { start_anchored, end_anchored, segments }) + } + + /// True if this pattern matches `path` according to the anchor flags. + /// + /// The pattern must match a contiguous run of segments in the path. + /// Start/end anchors restrict where that run may begin or end. + pub fn matches(&self, path: &TaxPath) -> bool { + let n = self.segments.len(); + let m = path.depth(); + + if n > m { return false; } + + let segs = path.segments(); + match (self.start_anchored, self.end_anchored) { + (true, true) => n == m && self.window_matches(segs, 0), + (true, false) => self.window_matches(segs, 0), + (false, true) => self.window_matches(segs, m - n), + (false, false) => (0..=(m - n)).any(|i| self.window_matches(segs, i)), + } + } + + fn window_matches(&self, segs: &[TaxSegment], start: usize) -> bool { + self.segments.iter() + .zip(segs[start..start + self.segments.len()].iter()) + .all(|(pat, seg)| pat.matches(seg)) + } +} diff --git a/src/obitaxonomy/src/segment.rs b/src/obitaxonomy/src/segment.rs new file mode 100644 index 0000000..b06436d --- /dev/null +++ b/src/obitaxonomy/src/segment.rs @@ -0,0 +1,49 @@ +use std::fmt; + +use crate::error::TaxError; + +/// A single node in a taxonomy path: a name and an optional rank. +/// +/// Neither `name` nor `rank` may contain `@` (reserved separator). +/// Serialised form: `name` or `name@rank`. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TaxSegment { + name: String, + rank: Option, +} + +impl TaxSegment { + pub fn parse(raw: &str) -> Result { + let parts: Vec<&str> = raw.splitn(3, '@').collect(); + + let (name_raw, rank_raw) = match parts.as_slice() { + [name] => (*name, None), + [name, rank] => (*name, Some(*rank)), + _ => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }), + }; + + if name_raw.is_empty() { + return Err(TaxError::EmptySegmentName); + } + + let rank = match rank_raw { + None => None, + Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }), + Some(r) => Some(r.to_string()), + }; + + Ok(Self { name: name_raw.to_string(), rank }) + } + + pub fn name(&self) -> &str { &self.name } + pub fn rank(&self) -> Option<&str> { self.rank.as_deref() } +} + +impl fmt::Display for TaxSegment { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match &self.rank { + None => write!(f, "{}", self.name), + Some(r) => write!(f, "{}@{}", self.name, r), + } + } +} diff --git a/src/obitaxonomy/src/segment_pattern.rs b/src/obitaxonomy/src/segment_pattern.rs new file mode 100644 index 0000000..13895ed --- /dev/null +++ b/src/obitaxonomy/src/segment_pattern.rs @@ -0,0 +1,41 @@ +use crate::error::TaxError; +use crate::segment::TaxSegment; + +/// A single segment in a query pattern: a required name and an optional rank filter. +/// +/// If `rank` is `None`, the pattern matches any segment with the given name, +/// regardless of its stored rank. If `rank` is `Some(r)`, both name and rank +/// must match exactly. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SegmentPattern { + name: String, + rank: Option, +} + +impl SegmentPattern { + pub fn parse(raw: &str) -> Result { + let parts: Vec<&str> = raw.splitn(3, '@').collect(); + let (name_raw, rank_raw) = match parts.as_slice() { + [name] => (*name, None), + [name, rank] => (*name, Some(*rank)), + _ => return Err(TaxError::AmbiguousRank { segment: raw.to_string() }), + }; + if name_raw.is_empty() { + return Err(TaxError::EmptySegmentName); + } + let rank = match rank_raw { + None => None, + Some("") => return Err(TaxError::EmptyRankName { segment: raw.to_string() }), + Some(r) => Some(r.to_string()), + }; + Ok(Self { name: name_raw.to_string(), rank }) + } + + /// True if this pattern matches `seg`. + /// Name must match exactly. If a rank is specified in the pattern, the + /// segment's rank must match; otherwise any rank (or no rank) is accepted. + pub fn matches(&self, seg: &TaxSegment) -> bool { + self.name == seg.name() + && self.rank.as_deref().map_or(true, |r| seg.rank() == Some(r)) + } +} diff --git a/test.sk.fasta b/test.sk.fasta deleted file mode 100644 index ff8e303..0000000 --- a/test.sk.fasta +++ /dev/null @@ -1,28 +0,0 @@ ->F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"} -GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA ->C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"} -AAAAAAATTAAGTCCCATATTGAAACATTAT ->9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"} -AAAAAATTAAGTCCCATATTGAAACATTATC ->74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"} -AAAAATTAAGTCCCATATTGAAACATTATCA ->45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"} -AAAATTAAGTCCCATATTGAAACATTATCAC ->5F44864B90170AF4 
{"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"} -AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT ->8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"} -AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG ->C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"} -AATGTGAGTTGTTAATATTACATAATTGGGTT ->933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"} -TGTGAGTTGTTAATATTACATAATTGGGTTT ->3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"} -GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT ->1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"} -TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC ->8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"} -AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT ->247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"} -CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT ->11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"} -GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG