From c694e1f2b0f3d8fbc1f3fdeed74093e1b65eb706 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 19 Jun 2026 09:55:41 +0200 Subject: [PATCH] feat: add benchmark pipeline, expose APIs, and enforce strict paths Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation. --- .DS_Store | Bin 10244 -> 10244 bytes .gitignore | 10 ++ .serena/.gitignore | 2 + .serena/project.yml | 133 ++++++++++++++++++ CLAUDE.md | 26 ++++ Makefile | 1 + benchmark/Makefile | 144 ++++++++++++++++++++ benchmark/README.md | 132 ++++++++++++++++++ benchmark/aggregate_stats.sh | 53 ++++++++ benchmark/build_reference.py | 137 +++++++++++++++++++ benchmark/build_reference.sh | 39 ++++++ benchmark/deps.mk | 199 +++++++++++++++++++++++++++ benchmark/downloads.sh | 48 +++++++ benchmark/filter_one_count.sh | 108 +++++++++++++++ benchmark/filter_one_presence.sh | 108 +++++++++++++++ benchmark/index_one_count.sh | 103 ++++++++++++++ benchmark/index_one_presence.sh | 102 ++++++++++++++ benchmark/make_deps.py | 118 ++++++++++++++++ benchmark/merge_count.sh | 103 ++++++++++++++ benchmark/merge_presence.sh | 104 ++++++++++++++ benchmark/simulate.sh | 12 ++ benchmark/simulate_one.sh | 33 +++++ benchmark/verify_count.py | 181 +++++++++++++++++++++++++ benchmark/verify_merge_count.py | 201 ++++++++++++++++++++++++++++ benchmark/verify_merge_count.sh | 27 ++++ benchmark/verify_merge_presence.py | 170 +++++++++++++++++++++++ benchmark/verify_merge_presence.sh | 27 ++++ benchmark/verify_one_count.sh | 30 +++++ benchmark/verify_one_presence.sh | 30 +++++ benchmark/verify_presence.py | 139 +++++++++++++++++++ docmd/implementation/filtering.md | 15 ++- mkdocs.yml | 1 + src/Cargo.lock | 4 + src/Cargo.toml | 2 +- src/obicompactvec/src/bitvec.rs | 8 +- src/obicompactvec/src/lib.rs | 8 +- src/obicompactvec/src/tempbitvec.rs | 23 ++-- src/obicompactvec/src/tempintvec.rs | 36 +++-- src/obidebruinj/src/debruijn.rs | 1 + src/obikindex/src/merge.rs | 4 +- src/obikmer/src/cmd/predicate.rs | 19 +-- test.sk.fasta | 28 ---- 42 files changed, 2585 insertions(+), 84 deletions(-) create mode 100644 .serena/.gitignore create mode 100644 .serena/project.yml create mode 100644 benchmark/Makefile create mode 100644 benchmark/README.md create mode 100755 benchmark/aggregate_stats.sh create mode 100755 benchmark/build_reference.py create mode 100755 benchmark/build_reference.sh create mode 100644 benchmark/deps.mk create mode 100755 benchmark/downloads.sh create mode 100755 benchmark/filter_one_count.sh create mode 100755 benchmark/filter_one_presence.sh create mode 100755 benchmark/index_one_count.sh create mode 100755 benchmark/index_one_presence.sh create mode 100644 benchmark/make_deps.py create mode 100755 benchmark/merge_count.sh create mode 100755 benchmark/merge_presence.sh create mode 100755 benchmark/simulate.sh create mode 100644 benchmark/simulate_one.sh create mode 100755 benchmark/verify_count.py create mode 100755 benchmark/verify_merge_count.py create mode 100755 benchmark/verify_merge_count.sh create mode 100755 benchmark/verify_merge_presence.py create mode 100755 benchmark/verify_merge_presence.sh create mode 100755 benchmark/verify_one_count.sh create mode 100755 benchmark/verify_one_presence.sh create mode 100755 benchmark/verify_presence.py delete mode 100644 test.sk.fasta diff --git a/.DS_Store b/.DS_Store index 5f0cbf7b1dbcb941bc5c3c51ebdd4b614453d7b7..96c32f71fc181e77027f3582347b8e854e7e5eb5 100644 GIT binary patch delta 1729 zcmeH{TWl0n7{|~50Od>=yJyRm9cbxpU7)e7r8i1Tp;or5NNZb5YcDh`yPW|h?9R}g z-9@c+Rd_)q3KNYnYT^?byd@Pi!3!Ei;^je8P0)ym4+axVyaf~EgJ))zhx(!qKKS4~ z%y<6ZneUr3`Tf5+GI(V0`=}0y5P*r#`b8qxqZ`YG(6MM+!zhFwzbaAN~|)n zp>rs4Ae}MHqxMh15FW5)zDiD8g;*+WrU+ToWpjT(lCM#29~~Q?knNZJmG(nb4YuKr z6?xv$hO~4trx&!IWKOdJWZ?zr#qwZv?b`YctsPyl?R|suKJKrO1M-T|f^O-lv^H`` zS9ANvlh%}GsJgLlLRU2-r|;FZtmLN+(y)=tXl|9VQmJA*!p8C`eLAn_?wl{W&1#ge z#80zHRUF_}Q*}yJub@U*!`Ln-AIoau?xp2A$a)ZPzY50I{luDcA z7RFU+B%MrYX|G$G+|GEfG+NNImTnp%S@c&Zo$^-ZM^qY{GBX)*a?5+TTiM3=8MfPb zd0*1X4II>rtH+8fH#2^oO<4PrS#98up=G_AxYEm5g8SUOQxsSAoLSneQ3ek zs-jZ`_xp;s4Jk^3bKNNDIRU{mc8ZX<_-Bl<$W;=N?s{(-IcuDCSMHZbx*NT;L>O63 znn@4oBYQ}e%#dT`aqa0F?+rL9rI~SdZ(`fKK!v zj+-!qU6{mfB%mS#0|zmU0*bg7_v0{*;sk6wipTIYPU2ad!t*$dx9~RJ!MperU*ao# zjf?mmKjRntO3SE^a_Xm5bPe4^JLqQGNw?5A?Wd!3l4@b^11P*mCQGpt=w8Rmb|SD0 zb0rgpw%@X&w{L-o+x~%xFO)k@TOO!f5ez$CZr{4i_Eis*D=Sx3Rj-yqBf>Y@9rk5K zDe|R`8`!+FReP-*;!A{$Jh`q;$TDFePp*%KLYxZcda|)8B=beWmy%n%F)AcVqGPix z^TooU-d@)YLK0^>JRR#1ogBL{R{WQ+Um)L-OXNrLhrnKr z5uEoRg(;V{DM)9r&%)ifN8r9s0Dln2@Gu_1aXf)1@stbwIT!j1conbX4Bo(-IEVB2 z5Fg=Ve1b3V4KCpaT=@&x{)J@mn?rW4CHMiGTN?g-aj#`&b1^fW&lqF+lo$gs*KMd02=A#~QJ6M#(;E-EEXC$L$HtyQ8tN)b(=>(fG}`ZoO@L_wL?%2KKE4MW`_x z;oSW(wz(War>=WcY`v5d3t4-9!7eR|Hcj)`<`&5xFx81e1=BuaTc)_6`Rg~fNO^x{ zCRMalHLDvowM#zc*pFJms4nd2TDh_?b-&S$JGeY})G9i5 zAurNZa#w`QiJ8Letd%c0Rc_B-DW@uZ2Mwn*JZI(Aoj$5IPpP>>&O=7g8a|e{idD8> zv4=&_<13r`OuJO@CpUx|(@Mu=<-8u#^i(v8P*7&%h?|y@%}=|-;aE9u%vx>91bJV? zAJ!^Jqh?A_FAdNX&Cwz)(Tnsdou>JkEaGWAgJoZhk99om({V+ofoJ=yP zM20&VS>JW;?=u+hzxTd@^x!{auu0diHu$M)x-r^XiQ>-PJ?>!BIO8|DKZIiLQoNz8 z^0y(rk>8@JNQl}M54Q6ynhJxc^I>Ovo2K$7mD6=chfnZAte)IP`C9{+O=Nm-G#NuWbGUHP8@6D>h>r zc480qDq~aV!#)gSzp}St?i8kBUyDP{7KvbkdNNxX!Y@d{4k z4ZMjnSjKr=#M^jB+5Q1O!56rOZ#@yTX6BHO|El=K9g4{1Wc0H3!jtvO>snU>`h-1G W$d9Ub>h6y<{qN9!!XY;q{p?Q;+ev2t diff --git a/.gitignore b/.gitignore index 76d17de..ec94743 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,13 @@ data-stress ./**/*.json *.bin Betula_exilis--IGA-24-33 +benchmark/genomes +benchmark/simulated_data +benchmark/specimen_index_presence +benchmark/specimen_index_count +benchmark/global_index_presence +benchmark/global_index_count +benchmark/stats +benchmark/reference_index +benchmark/specific_index_count +benchmark/specific_index_presence diff --git a/.serena/.gitignore b/.serena/.gitignore new file mode 100644 index 0000000..2e510af --- /dev/null +++ b/.serena/.gitignore @@ -0,0 +1,2 @@ +/cache +/project.local.yml diff --git a/.serena/project.yml b/.serena/project.yml new file mode 100644 index 0000000..1a35e2f --- /dev/null +++ b/.serena/project.yml @@ -0,0 +1,133 @@ +# the name by which the project can be referenced within Serena +project_name: "obikmer" + + +# list of languages for which language servers are started; choose from: +# al angular ansible bash clojure +# cpp cpp_ccls crystal csharp csharp_omnisharp +# dart elixir elm erlang fortran +# fsharp go groovy haskell haxe +# hlsl html java json julia +# kotlin lean4 lua luau markdown +# matlab msl nix ocaml pascal +# perl php php_phpactor powershell python +# python_jedi python_ty r rego ruby +# ruby_solargraph rust scala scss solidity +# svelte swift systemverilog terraform toml +# typescript typescript_vts vue yaml zig +# (This list may be outdated. For the current list, see values of Language enum here: +# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py +# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.) +# Note: +# - For C, use cpp +# - For JavaScript, use typescript +# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root) +# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm) +# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three) +# - For Free Pascal/Lazarus, use pascal +# Special requirements: +# Some languages require additional setup/installations. +# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers +# When using multiple languages, the first language server that supports a given file will be used for that file. +# The first language is the default language and the respective language server will be used as a fallback. +# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored. +languages: +- rust + +# the encoding used by text files in the project +# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings +encoding: "utf-8" + +# line ending convention to use when writing source files. +# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default) +# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings. +line_ending: + +# The language backend to use for this project. +# If not set, the global setting from serena_config.yml is used. +# Valid values: LSP, JetBrains +# Note: the backend is fixed at startup. If a project with a different backend +# is activated post-init, an error will be returned. +language_backend: + +# whether to use project's .gitignore files to ignore files +ignore_all_files_in_gitignore: true + +# advanced configuration option allowing to configure language server-specific options. +# Maps the language key to the options. +# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available. +# No documentation on options means no options are available. +ls_specific_settings: {} + +# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos). +# Paths can be absolute or relative to the project root. +# Each folder is registered as an LSP workspace folder, enabling language servers to discover +# symbols and references across package boundaries. +# Currently supported for: TypeScript. +# Example: +# additional_workspace_folders: +# - ../sibling-package +# - ../shared-lib +additional_workspace_folders: [] + +# list of additional paths to ignore in this project. +# Same syntax as gitignore, so you can use * and **. +# Note: global ignored_paths from serena_config.yml are also applied additively. +ignored_paths: [] + +# whether the project is in read-only mode +# If set to true, all editing tools will be disabled and attempts to use them will result in an error +# Added on 2025-04-18 +read_only: false + +# list of tool names to exclude. +# This extends the existing exclusions (e.g. from the global configuration) +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +excluded_tools: [] + +# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default). +# This extends the existing inclusions (e.g. from the global configuration). +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +included_optional_tools: [] + +# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools. +# This cannot be combined with non-empty excluded_tools or included_optional_tools. +# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html +fixed_tools: [] + +# list of mode names that are to be activated by default, overriding the setting in the global configuration. +# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes. +# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply. +# Otherwise, this overrides the setting from the global configuration (serena_config.yml). +# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply +# for this project. +# This setting can, in turn, be overridden by CLI parameters (--mode). +# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes +default_modes: + +# list of mode names to be activated additionally for this project, e.g. ["query-projects"] +# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes. +# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes +added_modes: + +# initial prompt for the project. It will always be given to the LLM upon activating the project +# (contrary to the memories, which are loaded on demand). +initial_prompt: "" + +# time budget (seconds) per tool call for the retrieval of additional symbol information +# such as docstrings or parameter information. +# This overrides the corresponding setting in the global configuration; see the documentation there. +# If null or missing, use the setting from the global configuration. +symbol_info_budget: + +# list of regex patterns which, when matched, mark a memory entry as read‑only. +# Extends the list from the global configuration, merging the two lists. +read_only_memory_patterns: [] + +# list of regex patterns for memories to completely ignore. +# Matching memories will not appear in list_memories or activate_project output +# and cannot be accessed via read_memory or write_memory. +# To access ignored memory files, use the read_file tool on the raw file path. +# Extends the list from the global configuration, merging the two lists. +# Example: ["_archive/.*", "_episodes/.*"] +ignored_memory_patterns: [] diff --git a/CLAUDE.md b/CLAUDE.md index 6fa8412..c6cac5a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s --- Je continue à poser mes questions et à guider la discussion. + +--- + +## MCP Tools + +**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.** + +### Hiérarchie des outils pour ce projet Rust + +**Navigation et édition de code → serena en priorité** +- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations` +- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols` +- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file` +- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview` +- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body` +- Ne pas utiliser `cclsp` quand serena couvre le besoin + +**Analyse architecturale → jcodemunch** +- Hotspots, couplage, dead code, dépendances entre modules +- Utiliser avant de refactorer une zone critique + +**Raisonnement complexe → sequential-thinking** +- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux + +**Documentation de crates → context7** +- Toujours consulter avant d'utiliser une API de bibliothèque externe diff --git a/Makefile b/Makefile index e203e6a..0fe6d46 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate mkdocs mkdocs-material \ mkdocs-mermaid2-plugin \ mkdocs-bibtex + $(PIP) install --quiet --upgrade InSilicoSeq # ── obikmer binary ─────────────────────────────────────────────────────────── diff --git a/benchmark/Makefile b/benchmark/Makefile new file mode 100644 index 0000000..5654ecc --- /dev/null +++ b/benchmark/Makefile @@ -0,0 +1,144 @@ +# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS +BINARY := ../src/target/release/obikmer +VENV_PY := ../.venv/bin/python3 + +GENOMES := $(wildcard genomes/*.fna.gz) + +# SPECIMENS, SPECIES, and the full dependency graph are generated by +# make_deps.py from the genome FASTA headers — like .d files in C. +# Make rebuilds deps.mk whenever genomes/ changes and restarts. +-include deps.mk + +REF_NPZS := $(SPECIMENS:%=reference_index/%.npz) +PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done) +PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats) +COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done) +COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats) +VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats) +VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats) +SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done) +SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats) +SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done) +SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats) +SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz) + +.NOTPARALLEL: + +.PHONY: all simulate reference \ + index_presence index_count \ + aggregate_index_presence aggregate_index_count \ + merge_presence merge_count \ + verify_presence verify_count \ + aggregate_verify_presence aggregate_verify_count \ + verify_merge_presence verify_merge_count \ + filter_presence filter_count \ + aggregate_filter_presence aggregate_filter_count + +verify_merge_presence: stats/verify_merge_presence/current.csv +verify_merge_count: stats/verify_merge_count/current.csv + +all: aggregate_verify_presence aggregate_verify_count \ + verify_merge_presence verify_merge_count \ + aggregate_filter_presence aggregate_filter_count + +# ── dependency file ─────────────────────────────────────────────────────────── + +deps.mk: $(GENOMES) + $(VENV_PY) make_deps.py $^ > $@ + +# ── simulation ──────────────────────────────────────────────────────────────── +# Prerequisites (genome → reads) are in deps.mk; $< is the genome file. + +$(SIMULATED_READS): + bash simulate_one.sh $< $(dir $@) + +simulate: $(SIMULATED_READS) + +# ── reference kmer sets ─────────────────────────────────────────────────────── +# Prerequisites (reads → npz) are in deps.mk. + +reference_index/%.npz: + bash build_reference.sh $* + +reference: $(REF_NPZS) + +# ── per-specimen indexing ───────────────────────────────────────────────────── +# Prerequisites (reads → index.done + .stats) are in deps.mk. + +specimen_index_presence/%/index.done \ +stats/indexing_presence/%.stats &: $(BINARY) + bash index_one_presence.sh $* + +specimen_index_count/%/index.done \ +stats/indexing_count/%.stats &: $(BINARY) + bash index_one_count.sh $* + +index_presence: $(PRESENCE_DONE) +index_count: $(COUNT_DONE) + +# ── indexing stats aggregation ──────────────────────────────────────────────── + +aggregate_index_presence: $(PRESENCE_STATS) + bash aggregate_stats.sh indexing_presence + +aggregate_index_count: $(COUNT_STATS) + bash aggregate_stats.sh indexing_count + +# ── global merge ────────────────────────────────────────────────────────────── + +global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY) + bash merge_presence.sh + +global_index_count/index.done: $(COUNT_DONE) $(BINARY) + bash merge_count.sh + +merge_presence: global_index_presence/index.done +merge_count: global_index_count/index.done + +# ── per-specimen verification ───────────────────────────────────────────────── +# Prerequisites (index.done + npz → .stats) are in deps.mk. + +stats/verify_presence/%.stats: + bash verify_one_presence.sh $* + +stats/verify_count/%.stats: + bash verify_one_count.sh $* + +verify_presence: $(VERIFY_PRESENCE_STATS) +verify_count: $(VERIFY_COUNT_STATS) + +# ── verification stats aggregation ─────────────────────────────────────────── + +aggregate_verify_presence: $(VERIFY_PRESENCE_STATS) + bash aggregate_stats.sh verify_presence + +aggregate_verify_count: $(VERIFY_COUNT_STATS) + bash aggregate_stats.sh verify_count + +# ── species-specific indexes ────────────────────────────────────────────────── +# Prerequisites (global index → specific index) are in deps.mk. + +specific_index_presence/%/index.done \ +stats/specific_kmer_presence/%.stats &: $(BINARY) + bash filter_one_presence.sh $* + +specific_index_count/%/index.done \ +stats/specific_kmer_count/%.stats &: $(BINARY) + bash filter_one_count.sh $* + +filter_presence: $(SPECIFIC_PRESENCE_DONE) +filter_count: $(SPECIFIC_COUNT_DONE) + +aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS) + bash aggregate_stats.sh specific_kmer_presence + +aggregate_filter_count: $(SPECIFIC_COUNT_STATS) + bash aggregate_stats.sh specific_kmer_count + +# ── merged index verification ───────────────────────────────────────────────── + +stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done + bash verify_merge_presence.sh + +stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done + bash verify_merge_count.sh diff --git a/benchmark/README.md b/benchmark/README.md new file mode 100644 index 0000000..04ad741 --- /dev/null +++ b/benchmark/README.md @@ -0,0 +1,132 @@ +# Benchmark pipeline + +Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`. + +``` +gmake all # full pipeline +gmake simulate # simulation only +gmake reference # reference kmer sets only +``` + +## Pipeline overview + +```mermaid +flowchart TD + GENOMES["genomes/*.fna.gz"] + BIN["obikmer binary"] + + GENOMES --> simulate + simulate --> simdata[("simulated_data/")] + + simdata --> reference + reference --> refnpz[("reference_index/*.npz")] + + subgraph presence ["Presence track"] + simdata --> index_presence + BIN --> index_presence + index_presence --> pres_done[("specimen_index_presence/")] + index_presence --> pres_istats[("stats/indexing_presence/")] + pres_istats --> aggregate_index_presence + + pres_done --> merge_presence + BIN --> merge_presence + merge_presence --> gpres[("global_index_presence/")] + + refnpz --> verify_presence + pres_done --> verify_presence + verify_presence --> vpres_stats[("stats/verify_presence/")] + vpres_stats --> aggregate_verify_presence + + gpres --> filter_presence + BIN --> filter_presence + filter_presence --> spec_pres[("specific_index_presence/")] + filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")] + spec_pres_stats --> aggregate_filter_presence + + refnpz --> verify_merge_presence + gpres --> verify_merge_presence + verify_merge_presence --> vmp[("stats/verify_merge_presence/")] + end + + subgraph count ["Count track"] + simdata --> index_count + BIN --> index_count + index_count --> count_done[("specimen_index_count/")] + index_count --> count_istats[("stats/indexing_count/")] + count_istats --> aggregate_index_count + + count_done --> merge_count + BIN --> merge_count + merge_count --> gcount[("global_index_count/")] + + refnpz --> verify_count + count_done --> verify_count + verify_count --> vcount_stats[("stats/verify_count/")] + vcount_stats --> aggregate_verify_count + + gcount --> filter_count + BIN --> filter_count + filter_count --> spec_count[("specific_index_count/")] + filter_count --> spec_count_stats[("stats/specific_kmer_count/")] + spec_count_stats --> aggregate_filter_count + + refnpz --> verify_merge_count + gcount --> verify_merge_count + verify_merge_count --> vmc[("stats/verify_merge_count/")] + end + + aggregate_verify_presence --> all + aggregate_verify_count --> all + vmp --> all + vmc --> all + all -. "$(MAKE) re-eval" .-> aggregate_filter_presence + all -. "$(MAKE) re-eval" .-> aggregate_filter_count +``` + +## Steps + +| Target | Script | Description | +|---|---|---| +| `simulate` | `simulate.sh` | Simulate sequencing reads from the reference genomes | +| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth | +| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) | +| `index_count` | `index_one_count.sh` | Index each specimen (count mode) | +| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) | +| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) | +| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index | +| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index | +| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference | +| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference | +| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) | +| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) | +| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index | +| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index | +| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) | +| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) | +| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets | +| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets | + +## Directory layout + +``` +benchmark/ +├── genomes/ # input reference genomes (.fna.gz) +├── simulated_data/ # generated by simulate +│ └── // +├── reference_index/ # reference kmer sets (.npz) +├── specimen_index_presence/ # per-specimen presence indexes +├── specimen_index_count/ # per-specimen count indexes +├── global_index_presence/ # merged global presence index +├── global_index_count/ # merged global count index +├── specific_index_presence/ # species-specific presence indexes +├── specific_index_count/ # species-specific count indexes +└── stats/ # all benchmark statistics + ├── indexing_presence/ + ├── indexing_count/ + ├── verify_presence/ + ├── verify_count/ + ├── specific_kmer_presence/ + ├── specific_kmer_count/ + ├── verify_merge_presence/ + └── verify_merge_count/ +``` diff --git a/benchmark/aggregate_stats.sh b/benchmark/aggregate_stats.sh new file mode 100755 index 0000000..19901bb --- /dev/null +++ b/benchmark/aggregate_stats.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +# Usage: aggregate_stats.sh TYPE +# TYPE = indexing_presence | indexing_count | verify_presence | verify_count +# +# Reads all stats/TYPE/*.stats files (one CSV data row each, no header). +# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than +# the most recent run CSV (idempotent when nothing changed). +set -euo pipefail + +TYPE="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}" + +case "${TYPE}" in + indexing_presence|indexing_count) + HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b" + ;; + verify_presence) + HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct" + ;; + verify_count) + HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct" + ;; + specific_kmer_presence|specific_kmer_count) + HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b" + ;; + *) + echo "ERROR: unknown stats type '${TYPE}'" >&2 + exit 1 + ;; +esac + +# Find most recent existing run CSV (empty string if none). +latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1) + +# Check if any .stats file is newer than the latest run CSV. +if [[ -n "${latest_csv}" ]] && \ + [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then + echo "[${TYPE}] stats up to date (${latest_csv})" + exit 0 +fi + +run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')") +CSV="${STATS_DIR}/run_${run_n}.csv" + +echo "${HEADER}" >"${CSV}" + +# Sort .stats files by name for reproducible row order. +while IFS= read -r stats_file; do + sed "s/^/${run_n},/" "${stats_file}" +done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}" + +echo "[${TYPE}] run ${run_n} → ${CSV}" diff --git a/benchmark/build_reference.py b/benchmark/build_reference.py new file mode 100755 index 0000000..eddd3da --- /dev/null +++ b/benchmark/build_reference.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +"""Build a reference kmer index from paired-end FASTQ reads. + +Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 — +counts their abundances, and saves a sorted numpy pair (kmers, counts). + +Output .npz arrays + kmers : uint64, sorted ascending — canonical kmer integers + counts : uint32, same order — raw read abundances +""" +import argparse +import gzip +import sys +from collections import defaultdict + +import numpy as np + + +# ── encoding ──────────────────────────────────────────────────────────────── + +_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, + 'a': 0, 'c': 1, 'g': 2, 't': 3} + +# Lookup table: revcomp of one byte (4 bases, 8 bits). +# Precomputed once at import time. +_REVCOMP8 = [0] * 256 +for _i in range(256): + _rc, _x = 0, _i + for _ in range(4): + _rc = (_rc << 2) | (3 - (_x & 3)) + _x >>= 2 + _REVCOMP8[_i] = _rc +del _i, _rc, _x + + +def revcomp_int(kmer: int, k: int) -> int: + """Reverse-complement of a kmer encoded as an integer (2 bits/base). + + Uses byte-level lookup (4 bases at a time) for speed. + """ + rc = 0 + bits_left = 2 * k + while bits_left > 0: + chunk = min(8, bits_left) + rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk) + rc = (rc << chunk) | rc_byte + kmer >>= chunk + bits_left -= chunk + return rc + + +# ── FASTQ parsing ──────────────────────────────────────────────────────────── + +def iter_sequences(path: str): + """Yield raw sequences from a (gzipped) FASTQ file.""" + opener = gzip.open if path.endswith('.gz') else open + with opener(path, 'rt') as fh: + while True: + if not fh.readline(): # '@' header + break + seq = fh.readline().rstrip('\n') + fh.readline() # '+' + fh.readline() # quality + yield seq + + +# ── kmer counting ──────────────────────────────────────────────────────────── + +def count_kmers(paths: list[str], k: int) -> dict[int, int]: + mask = (1 << (2 * k)) - 1 + counts: dict[int, int] = defaultdict(int) + n_reads = 0 + + for path in paths: + for seq in iter_sequences(path): + n_reads += 1 + kmer = 0 + run = 0 # consecutive valid bases + + for c in seq: + b = _ENCODE.get(c) + if b is None: # N or unexpected character → reset + kmer = 0 + run = 0 + continue + kmer = ((kmer << 2) | b) & mask + run += 1 + if run >= k: + rc = revcomp_int(kmer, k) + counts[kmer if kmer <= rc else rc] += 1 + + if n_reads % 100_000 == 0: + print(f' {n_reads:,} reads processed, ' + f'{len(counts):,} distinct kmers so far', + file=sys.stderr) + + print(f' {n_reads:,} reads total, {len(counts):,} distinct kmers', + file=sys.stderr) + return counts + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('reads', nargs='+', metavar='FASTQ', + help='Input reads (FASTQ, gzip OK)') + ap.add_argument('-k', '--kmer-size', type=int, default=31, + metavar='K') + ap.add_argument('--min-abundance', type=int, default=1, + metavar='N', help='Drop kmers with count < N (default 1)') + ap.add_argument('-o', '--output', required=True, + metavar='FILE', help='Output .npz path') + args = ap.parse_args() + + print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr) + counts = count_kmers(args.reads, args.kmer_size) + + if args.min_abundance > 1: + before = len(counts) + counts = {k: v for k, v in counts.items() if v >= args.min_abundance} + print(f' min-abundance={args.min_abundance}: ' + f'{before - len(counts):,} kmers dropped, ' + f'{len(counts):,} retained', + file=sys.stderr) + + print(f'Sorting and saving → {args.output}', file=sys.stderr) + kmers_arr = np.fromiter(sorted(counts), dtype=np.uint64, count=len(counts)) + counts_arr = np.array([counts[int(k)] for k in kmers_arr], dtype=np.uint32) + + np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr) + print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/benchmark/build_reference.sh b/benchmark/build_reference.sh new file mode 100755 index 0000000..3d312c1 --- /dev/null +++ b/benchmark/build_reference.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SIMDATA_DIR="${SCRIPT_DIR}/simulated_data" +REF_DIR="${SCRIPT_DIR}/reference_index" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +BUILD_PY="${SCRIPT_DIR}/build_reference.py" + +KMER_SIZE="${KMER_SIZE:-31}" +MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}" + +mkdir -p "${REF_DIR}" + +for species_dir in "${SIMDATA_DIR}"/*/; do + [[ -d "${species_dir}" ]] || continue + species=$(basename "${species_dir}") + + for strain_dir in "${species_dir}"*/; do + [[ -d "${strain_dir}" ]] || continue + strain=$(basename "${strain_dir}") + + r1="${strain_dir}/reads_R1.fastq.gz" + r2="${strain_dir}/reads_R2.fastq.gz" + if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then + echo "SKIP ${species}--${strain}: reads not found" >&2 + continue + fi + + out="${REF_DIR}/${species}--${strain}.npz" + echo "[${species}--${strain}] → ${out}" + + "${PYTHON}" "${BUILD_PY}" \ + --kmer-size "${KMER_SIZE}" \ + --min-abundance "${MIN_ABUNDANCE}" \ + --output "${out}" \ + "${r1}" "${r2}" + done +done diff --git a/benchmark/deps.mk b/benchmark/deps.mk new file mode 100644 index 0000000..031dd59 --- /dev/null +++ b/benchmark/deps.mk @@ -0,0 +1,199 @@ +SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2 +SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris + +# Escherichia_coli--K-12_MG1655 +simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz +reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz +stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done +stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done + +# Escherichia_coli--EDL933 +simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz +reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz +stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done +stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done + +# Salmonella_enterica--LT2 +simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz +reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done +stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done + +# Escherichia_coli--CFT073 +simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz +reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz +stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done +stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done + +# Bacillus_subtilis--168 +simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz +reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz +specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz +specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz +stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done +stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done + +# Salmonella_enterica--P125109 +simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz +reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done +stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done + +# Shouchella_clausii--KSM-K16 +simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz +reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz +specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz +specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz +stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done +stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done + +# Escherichia_coli--K-12_W3110 +simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz +reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz +specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz +specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz +stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done +stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done + +# Klebsiella_pneumoniae--MGH_78578 +simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz +reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz +specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz +specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz +stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done +stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done + +# Opitutus_terrae--PB90-1 +simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz +reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz +specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz +specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz +stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done +stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done + +# Saccharolobus_islandicus--M.16.4 +simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz +reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz +specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz +specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz +stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done +stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done + +# Acidobacterium_capsulatum--ATCC_51196 +simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz +reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz +specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz +specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz +stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done +stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done + +# Salmonella_enterica--AKU_12601 +simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz +reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done +stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done + +# Proteus_mirabilis--HI4320 +simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz +reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz +specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz +specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz +stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done +stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done + +# Salmonella_enterica--CT18 +simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz +reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz +specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz +specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz +stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done +stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done + +# Klebsiella_pneumoniae--HS11286 +simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz +reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz +specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz +specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz +stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done +stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done + +# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 +simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz +reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz +specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz +specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz +stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done +stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done + +# Klebsiella_pneumoniae--ATCC_13883 +simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz +reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz +specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz +specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz +stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done +stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done + +# Yersinia_ruckeri--YRB +simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz +reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz +specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz +specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz +stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done +stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done + +# Candidozyma_auris--GCF_003013715.1_ASM301371v2 +simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz +reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz +specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz +specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz +stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done +stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done + +# Escherichia_coli +specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done +specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done +# Salmonella_enterica +specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done +specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done +# Bacillus_subtilis +specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done +specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done +# Shouchella_clausii +specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done +specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done +# Klebsiella_pneumoniae +specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done +specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done +# Opitutus_terrae +specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done +specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done +# Saccharolobus_islandicus +specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done +specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done +# Acidobacterium_capsulatum +specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done +specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done +# Proteus_mirabilis +specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done +specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done +# Wolbachia_endosymbiont +specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done +specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done +# Yersinia_ruckeri +specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done +specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done +# Candidozyma_auris +specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done +specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done diff --git a/benchmark/downloads.sh b/benchmark/downloads.sh new file mode 100755 index 0000000..d86111e --- /dev/null +++ b/benchmark/downloads.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +set -euo pipefail + +assemblies=( + GCF_000005845.2 + GCF_000010245.2 + GCF_000007445.1 + GCF_000006665.1 + + GCF_000006945.2 + GCF_000195995.1 + GCF_000009505.1 + GCF_000026565.1 + + GCF_000016305.1 + GCF_000019965.1 + GCF_000240185.1 + GCF_000742135.1 + + GCF_000069965.1 + GCF_000022565.1 + GCF_000306885.1 + GCF_003013715.1 + + GCF_000009045.1 + GCF_000009825.1 + GCF_000022445.1 + GCF_000834255.1 +) + +mkdir -p genomes + +for acc in "${assemblies[@]}"; do + echo "Downloading ${acc}" + + datasets download genome accession "${acc}" \ + --include genome \ + --filename "${acc}.zip" + + unzip -q "${acc}.zip" -d "${acc}" + find "${acc}" -name "*.fna" | + while read file; do + obiconvert -Z ${file} >genomes/$(basename ${file}).gz + done + + rm -rf "${acc}" "${acc}.zip" +done diff --git a/benchmark/filter_one_count.sh b/benchmark/filter_one_count.sh new file mode 100755 index 0000000..115ed3c --- /dev/null +++ b/benchmark/filter_one_count.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# Usage: filter_one_count.sh SPECIES +# Filters global_index_count to keep only kmers specific to SPECIES, +# then selects the SPECIES column in-place. +# Outputs: +# specific_index_count/SPECIES/index.done (written by obikmer select) +# stats/specific_kmer_count/SPECIES.stats (one CSV data row, no header) +set -euo pipefail + +SPECIES="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" + +SOURCE="${SCRIPT_DIR}/global_index_count" +OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}" +STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count" +STATS_FILE="${STATS_DIR}/${SPECIES}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIES}] filter (count) → ${OUTPUT}" + +LOG_FILTER=$(mktemp) +LOG_SELECT=$(mktemp) +trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT + +"${BINARY}" filter \ + --output "${OUTPUT}" \ + --force \ + --ingroup "species=${SPECIES}" \ + --outgroup all \ + --min-frac 0.5 \ + --max-frac 1.0 \ + --max-outgroup-count 0 \ + "${SOURCE}" \ + 2>"${LOG_FILTER}" + +cat "${LOG_FILTER}" >&2 + +"${BINARY}" select \ + --in-place \ + --group "${SPECIES}:species=${SPECIES}" \ + --group-op "${SPECIES}:any" \ + --select "${SPECIES}" \ + "${OUTPUT}" \ + 2>"${LOG_SELECT}" + +cat "${LOG_SELECT}" >&2 + +python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}" +import sys, re + +species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +def parse_reporter(logfile): + stats = {} + state = 'scan' + with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): state = 'rows' + elif state == 'rows': + if is_sep(s): state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats['TOTAL'] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + return stats + +f = parse_reporter(log_filter) +s = parse_reporter(log_select) + +row = [species] +for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]: + key = 'TOTAL' if stage.endswith('_total') else stage + w, r = d.get(key, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +print(','.join(row)) +PYEOF diff --git a/benchmark/filter_one_presence.sh b/benchmark/filter_one_presence.sh new file mode 100755 index 0000000..12099ce --- /dev/null +++ b/benchmark/filter_one_presence.sh @@ -0,0 +1,108 @@ +#!/usr/bin/env bash +# Usage: filter_one_presence.sh SPECIES +# Filters global_index_presence to keep only kmers specific to SPECIES, +# then selects the SPECIES column in-place. +# Outputs: +# specific_index_presence/SPECIES/index.done (written by obikmer select) +# stats/specific_kmer_presence/SPECIES.stats (one CSV data row, no header) +set -euo pipefail + +SPECIES="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" + +SOURCE="${SCRIPT_DIR}/global_index_presence" +OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}" +STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence" +STATS_FILE="${STATS_DIR}/${SPECIES}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIES}] filter (presence) → ${OUTPUT}" + +LOG_FILTER=$(mktemp) +LOG_SELECT=$(mktemp) +trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT + +"${BINARY}" filter \ + --output "${OUTPUT}" \ + --force \ + --ingroup "species=${SPECIES}" \ + --outgroup all \ + --min-frac 0.5 \ + --max-frac 1.0 \ + --max-outgroup-count 0 \ + "${SOURCE}" \ + 2>"${LOG_FILTER}" + +cat "${LOG_FILTER}" >&2 + +"${BINARY}" select \ + --in-place \ + --group "${SPECIES}:species=${SPECIES}" \ + --group-op "${SPECIES}:any" \ + --select "${SPECIES}" \ + "${OUTPUT}" \ + 2>"${LOG_SELECT}" + +cat "${LOG_SELECT}" >&2 + +python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}" +import sys, re + +species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +def parse_reporter(logfile): + stats = {} + state = 'scan' + with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): state = 'rows' + elif state == 'rows': + if is_sep(s): state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats['TOTAL'] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + return stats + +f = parse_reporter(log_filter) +s = parse_reporter(log_select) + +row = [species] +for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]: + key = 'TOTAL' if stage.endswith('_total') else stage + w, r = d.get(key, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +print(','.join(row)) +PYEOF diff --git a/benchmark/index_one_count.sh b/benchmark/index_one_count.sh new file mode 100755 index 0000000..325ec7f --- /dev/null +++ b/benchmark/index_one_count.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# Usage: index_one_count.sh SPECIMEN +# SPECIMEN = "species--strain" (Make pattern stem) +# Outputs: +# specimen_index_count/SPECIMEN/index.done (written by obikmer) +# stats/indexing_count/SPECIMEN.stats (one CSV data row, no header) +set -euo pipefail + +SPECIMEN="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" + +species="${SPECIMEN%%--*}" +strain="${SPECIMEN#*--}" + +READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}" +INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}" +STATS_DIR="${SCRIPT_DIR}/stats/indexing_count" +STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats" + +mkdir -p "${STATS_DIR}" + +r1="${READS_DIR}/reads_R1.fastq.gz" +r2="${READS_DIR}/reads_R2.fastq.gz" +if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then + echo "ERROR: reads not found in ${READS_DIR}" >&2 + exit 1 +fi + +echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}" + +STDERR_LOG=$(mktemp) +trap 'rm -f "${STDERR_LOG}"' EXIT + +"${BINARY}" index \ + --output "${INDEX_PATH}" \ + --force \ + --theta 0 \ + --with-counts \ + --label "${SPECIMEN}" \ + --meta "species=${species}" \ + "${r1}" "${r2}" \ + 2>"${STDERR_LOG}" + +cat "${STDERR_LOG}" >&2 + +python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}" +import sys, re + +species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +stats = {} +state = 'scan' + +with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): state = 'rows' + elif state == 'rows': + if is_sep(s): state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats[parts[0]] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + +STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index'] +row = [species, strain] +for stage in STAGE_ORDER: + w, r = stats.get(stage, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +tw, tr = stats.get('TOTAL', ('', '')) +row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)] +print(','.join(row)) +PYEOF diff --git a/benchmark/index_one_presence.sh b/benchmark/index_one_presence.sh new file mode 100755 index 0000000..029c537 --- /dev/null +++ b/benchmark/index_one_presence.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +# Usage: index_one_presence.sh SPECIMEN +# SPECIMEN = "species--strain" (Make pattern stem) +# Outputs: +# specimen_index_presence/SPECIMEN/index.done (written by obikmer) +# stats/indexing_presence/SPECIMEN.stats (one CSV data row, no header) +set -euo pipefail + +SPECIMEN="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" + +species="${SPECIMEN%%--*}" +strain="${SPECIMEN#*--}" + +READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}" +INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}" +STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence" +STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats" + +mkdir -p "${STATS_DIR}" + +r1="${READS_DIR}/reads_R1.fastq.gz" +r2="${READS_DIR}/reads_R2.fastq.gz" +if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then + echo "ERROR: reads not found in ${READS_DIR}" >&2 + exit 1 +fi + +echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}" + +STDERR_LOG=$(mktemp) +trap 'rm -f "${STDERR_LOG}"' EXIT + +"${BINARY}" index \ + --output "${INDEX_PATH}" \ + --force \ + --theta 0 \ + --label "${SPECIMEN}" \ + --meta "species=${species}" \ + "${r1}" "${r2}" \ + 2>"${STDERR_LOG}" + +cat "${STDERR_LOG}" >&2 + +python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}" +import sys, re + +species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +stats = {} +state = 'scan' + +with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): state = 'rows' + elif state == 'rows': + if is_sep(s): state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats[parts[0]] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + +STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index'] +row = [species, strain] +for stage in STAGE_ORDER: + w, r = stats.get(stage, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +tw, tr = stats.get('TOTAL', ('', '')) +row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)] +print(','.join(row)) +PYEOF diff --git a/benchmark/make_deps.py b/benchmark/make_deps.py new file mode 100644 index 0000000..03f7e2a --- /dev/null +++ b/benchmark/make_deps.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +"""Generate deps.mk — pure dependency declarations for the benchmark pipeline. + +Like C .d files: only target: prerequisites lines, no recipes. +Recipes stay in the Makefile as generic rules. +""" +import gzip +import re +import sys +from pathlib import Path + +STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome', + 'endosymbiont', 'of'} +STOP_PREFIXES = ('scaffold', 'contig', 'plasmid') + + +def is_stop(tok): + t = tok.lower() + return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES) + + +def sanitize(s): + return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_') + + +def collect_tokens(text): + parts = [] + for tok in text.split(): + tok = tok.rstrip(',.') + if is_stop(tok): + break + parts.append(sanitize(tok)) + return '_'.join(filter(None, parts)) + + +def parse_organism(defn, gcf_id): + words = defn.split() + species = sanitize(words[0] + '_' + words[1]) + + m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn) + if m: + strain = sanitize(m.group(1)) + if m.group(2): + strain += '_' + sanitize(m.group(2)) + return species, strain + + m = re.search(r'\bstrain\b\s+(.*)', defn) + if m: + strain = collect_tokens(m.group(1)) + if strain: + return species, strain + + remainder = re.sub(r'^\S+ \S+\s*', '', defn) + remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder) + remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder) + strain = collect_tokens(remainder) + return species, strain if strain else gcf_id + + +def first_definition(path): + with gzip.open(path, 'rt') as fh: + for line in fh: + if line.startswith('>'): + m = re.search(r'"definition":"([^"]*)"', line) + return m.group(1) if m else line[1:].split()[0] + return Path(path).stem + + +def main(): + entries = [] # (specimen, species, sim_dir, genome_path) + species_seen = [] + + for path in sorted(sys.argv[1:]): + gcf_id = Path(path).name.replace('_genomic.fna.gz', '') + defn = first_definition(path) + sp, st = parse_organism(defn, gcf_id) + specimen = f'{sp}--{st}' + sim_dir = f'simulated_data/{sp}/{st}' + entries.append((specimen, sp, sim_dir, path)) + if sp not in species_seen: + species_seen.append(sp) + + specimens = [e[0] for e in entries] + print('SPECIMENS :=', ' '.join(specimens)) + print('SPECIES :=', ' '.join(species_seen)) + + for specimen, species, sim_dir, genome in entries: + reads = f'{sim_dir}/reads_R1.fastq.gz' + p_done = f'specimen_index_presence/{specimen}/index.done' + p_stats = f'stats/indexing_presence/{specimen}.stats' + c_done = f'specimen_index_count/{specimen}/index.done' + c_stats = f'stats/indexing_count/{specimen}.stats' + ref = f'reference_index/{specimen}.npz' + vp = f'stats/verify_presence/{specimen}.stats' + vc = f'stats/verify_count/{specimen}.stats' + + print() + print(f'# {specimen}') + print(f'{reads}: {genome}') + print(f'{ref}: {reads}') + print(f'{p_done} {p_stats}: {reads}') + print(f'{c_done} {c_stats}: {reads}') + print(f'{vp}: {ref} {p_done}') + print(f'{vc}: {ref} {c_done}') + + print() + for sp in species_seen: + sp_done = f'specific_index_presence/{sp}/index.done' + sp_stats = f'stats/specific_kmer_presence/{sp}.stats' + sc_done = f'specific_index_count/{sp}/index.done' + sc_stats = f'stats/specific_kmer_count/{sp}.stats' + print(f'# {sp}') + print(f'{sp_done} {sp_stats}: global_index_presence/index.done') + print(f'{sc_done} {sc_stats}: global_index_count/index.done') + + +if __name__ == '__main__': + main() diff --git a/benchmark/merge_count.sh b/benchmark/merge_count.sh new file mode 100755 index 0000000..871b436 --- /dev/null +++ b/benchmark/merge_count.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +IDX_DIR="${SCRIPT_DIR}/specimen_index_count" +OUTPUT="${SCRIPT_DIR}/global_index_count" +STATS_DIR="${SCRIPT_DIR}/stats/merge_count" + +mkdir -p "${STATS_DIR}" + +run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')") +CSV="${STATS_DIR}/run_${run_n}.csv" + +printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}" + +parse_reporter() { + local run="$1" n_sources="$2" logfile="$3" + python3 - "$run" "$n_sources" "$logfile" <<'PYEOF' +import sys, re + +run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +stats = {} +state = 'scan' + +with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): + state = 'rows' + elif state == 'rows': + if is_sep(s): + state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats[parts[0]] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + +STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack'] +row = [run, n_sources] +for stage in STAGE_ORDER: + w, r = stats.get(stage, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +tw, tr = stats.get('TOTAL', ('', '')) +row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)] +print(','.join(row)) +PYEOF +} + +mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort) + +if [[ ${#sources[@]} -eq 0 ]]; then + echo "ERROR: no indexes found in ${IDX_DIR}" >&2 + exit 1 +fi + +echo "Merging ${#sources[@]} count indexes → ${OUTPUT}" +printf ' %s\n' "${sources[@]}" + +STDERR_LOG=$(mktemp) +trap 'rm -f "${STDERR_LOG}"' EXIT + +"${BINARY}" merge \ + --output "${OUTPUT}" \ + --force \ + "${sources[@]}" \ + 2>"${STDERR_LOG}" + +cat "${STDERR_LOG}" >&2 +parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}" + +echo "Done. Run ${run_n} → ${CSV}" diff --git a/benchmark/merge_presence.sh b/benchmark/merge_presence.sh new file mode 100755 index 0000000..7a816d1 --- /dev/null +++ b/benchmark/merge_presence.sh @@ -0,0 +1,104 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +IDX_DIR="${SCRIPT_DIR}/specimen_index_presence" +OUTPUT="${SCRIPT_DIR}/global_index_presence" +STATS_DIR="${SCRIPT_DIR}/stats/merge_presence" + +mkdir -p "${STATS_DIR}" + +run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')") +CSV="${STATS_DIR}/run_${run_n}.csv" + +printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}" + +parse_reporter() { + local run="$1" n_sources="$2" logfile="$3" + python3 - "$run" "$n_sources" "$logfile" <<'PYEOF' +import sys, re + +run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3] + +def strip_ansi(s): + return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s) + +def parse_wall(s): + s = s.strip() + if s.endswith('ms'): return float(s[:-2]) / 1000.0 + if s.endswith('s'): return float(s[:-1]) + return 0.0 + +def parse_rss(s): + m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip()) + if not m: return 0 + return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)]) + +def is_sep(s): + return bool(s) and not re.search(r'[A-Za-z0-9]', s) + +stats = {} +state = 'scan' + +with open(logfile, errors='replace') as fh: + for raw in fh: + line = strip_ansi(raw.rstrip('\n')) + s = line.strip() + + if state == 'scan': + if re.search(r'\bstage\b.*\bwall\b', line): + state = 'in_header' + elif state == 'in_header': + if is_sep(s): + state = 'rows' + elif state == 'rows': + if is_sep(s): + state = 'total' + elif s: + parts = re.split(r' +', s) + if len(parts) >= 4: + stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3])) + elif state == 'total': + if s: + parts = re.split(r' +', s) + if len(parts) >= 3: + stats[parts[0]] = (parse_wall(parts[1]), + parse_rss(parts[3]) if len(parts) > 3 else 0) + break + +STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack'] +row = [run, n_sources] +for stage in STAGE_ORDER: + w, r = stats.get(stage, ('', '')) + row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)] +tw, tr = stats.get('TOTAL', ('', '')) +row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)] +print(','.join(row)) +PYEOF +} + +mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort) + +if [[ ${#sources[@]} -eq 0 ]]; then + echo "ERROR: no indexes found in ${IDX_DIR}" >&2 + exit 1 +fi + +echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}" +printf ' %s\n' "${sources[@]}" + +STDERR_LOG=$(mktemp) +trap 'rm -f "${STDERR_LOG}"' EXIT + +"${BINARY}" merge \ + --output "${OUTPUT}" \ + --force \ + --force-presence \ + "${sources[@]}" \ + 2>"${STDERR_LOG}" + +cat "${STDERR_LOG}" >&2 +parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}" + +echo "Done. Run ${run_n} → ${CSV}" diff --git a/benchmark/simulate.sh b/benchmark/simulate.sh new file mode 100755 index 0000000..c486255 --- /dev/null +++ b/benchmark/simulate.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Simulate all genomes. Delegates to simulate_one.sh per genome. +# Prefer running via `gmake simulate` which handles individual dependencies. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +for genome_file in "${SCRIPT_DIR}"/genomes/*.fna.gz; do + out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \ + --dir-for "${genome_file}") + bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}" +done diff --git a/benchmark/simulate_one.sh b/benchmark/simulate_one.sh new file mode 100644 index 0000000..d4c4c1a --- /dev/null +++ b/benchmark/simulate_one.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# Usage: simulate_one.sh genome.fna.gz output_dir +# Simulates paired-end HiSeq reads for a single genome. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ISS="${SCRIPT_DIR}/../.venv/bin/iss" +COVERAGE=15 +READ_LENGTH=150 +CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}" + +genome_file="$1" +out_dir="$2" + +mkdir -p "${out_dir}" + +tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna") +trap 'rm -f "${tmp_fasta}"' EXIT + +gzip -dc "${genome_file}" > "${tmp_fasta}" + +genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ') +n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))") + +echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)" + +"${ISS}" generate \ + --genomes "${tmp_fasta}" \ + --model HiSeq \ + --n_reads "${n_reads}" \ + --cpus "${CPUS}" \ + --compress \ + --output "${out_dir}/reads" diff --git a/benchmark/verify_count.py b/benchmark/verify_count.py new file mode 100755 index 0000000..0b204e0 --- /dev/null +++ b/benchmark/verify_count.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +"""Compare an obikmer count index against a reference kmer set (presence + counts). + +Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py), +streams `obikmer dump` from a --with-counts index, then reports: + - false negatives : kmers in reference absent from the index + - false positives : kmers in the index absent from the reference + - count mismatches: kmers present in both but with differing counts + +Output to stdout: one CSV row + species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch, + fn_pct,fp_pct,cm_pct +""" +import argparse +import subprocess +import sys + +import numpy as np + + +# ── encoding ────────────────────────────────────────────────────────────────── + +_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, + 'a': 0, 'c': 1, 'g': 2, 't': 3} + +_DECODE = ['A', 'C', 'G', 'T'] + + +def encode_kmer(s: str) -> int: + kmer = 0 + for c in s: + kmer = (kmer << 2) | _ENCODE[c] + return kmer + + +def decode_kmer(val: int, k: int) -> str: + bases = [] + for _ in range(k): + bases.append(_DECODE[val & 3]) + val >>= 2 + return ''.join(reversed(bases)) + + +# ── dump parsing ────────────────────────────────────────────────────────────── + +def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]: + """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32).""" + cmd = [obikmer_bin, 'dump', index_dir] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, + text=True) + kmers, counts = [], [] + header = True + for line in proc.stdout: + if header: + header = False + continue + parts = line.rstrip('\n').split(',') + kmers.append(encode_kmer(parts[0])) + counts.append(int(parts[1])) + proc.wait() + if proc.returncode != 0: + print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr) + sys.exit(1) + order = np.argsort(np.array(kmers, dtype=np.uint64), kind='stable') + return (np.array(kmers, dtype=np.uint64)[order], + np.array(counts, dtype=np.uint32)[order]) + + +# ── comparison ──────────────────────────────────────────────────────────────── + +def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray, + idx_kmers: np.ndarray, idx_counts: np.ndarray, + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts). + + All arrays sorted; cm_* cover kmers present in both arrays but with + differing counts. + """ + false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True) + false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True) + + # Count mismatches among shared kmers. + # Both arrays are sorted so we can use searchsorted. + pos_in_idx = np.searchsorted(idx_kmers, ref_kmers) + pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1) + shared_mask = idx_kmers[pos_in_idx] == ref_kmers + + shared_ref_counts = ref_counts[shared_mask] + shared_idx_counts = idx_counts[pos_in_idx[shared_mask]] + mismatch_mask = shared_ref_counts != shared_idx_counts + + cm_kmers = ref_kmers[shared_mask][mismatch_mask] + cm_ref_counts = shared_ref_counts[mismatch_mask] + cm_idx_counts = shared_idx_counts[mismatch_mask] + + return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('reference', metavar='REF_NPZ', nargs='?', + help='Reference .npz file') + ap.add_argument('index', metavar='INDEX_DIR', nargs='?', + help='obikmer index directory (built with --with-counts)') + ap.add_argument('--obikmer', default='obikmer', + help='Path to obikmer binary') + ap.add_argument('--species', default='') + ap.add_argument('--strain', default='') + ap.add_argument('--header', action='store_true', + help='Print CSV header and exit') + ap.add_argument('--save-fp', metavar='FILE', + help='Save false-positive kmer strings to FILE') + ap.add_argument('--save-fn', metavar='FILE', + help='Save false-negative kmer strings to FILE') + ap.add_argument('--save-cm', metavar='FILE', + help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE') + args = ap.parse_args() + + if args.header: + print('species,strain,ref_kmers,idx_kmers,' + 'false_neg,false_pos,count_mismatch,' + 'fn_pct,fp_pct,cm_pct') + return + + # Detect k + cmd1 = [args.obikmer, 'dump', '--head', '1', args.index] + out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True) + k = len(out1.splitlines()[1].split(',')[0]) + + # Load reference + print(f'Loading reference: {args.reference}', file=sys.stderr) + npz = np.load(args.reference) + ref_kmers = npz['kmers'] # sorted uint64 + ref_counts = npz['counts'] # uint32 + + # Load index + print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr) + idx_kmers, idx_counts = load_index(args.obikmer, args.index) + + print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr) + + false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare( + ref_kmers, ref_counts, idx_kmers, idx_counts) + + n_shared = len(ref_kmers) - len(false_neg) + fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 + fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 + cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0 + + print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr) + print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr) + print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)', + file=sys.stderr) + + if args.save_fn and len(false_neg): + with open(args.save_fn, 'w') as fh: + for v in false_neg: + fh.write(decode_kmer(int(v), k) + '\n') + + if args.save_fp and len(false_pos): + with open(args.save_fp, 'w') as fh: + for v in false_pos: + fh.write(decode_kmer(int(v), k) + '\n') + + if args.save_cm and len(cm_kmers): + with open(args.save_cm, 'w') as fh: + fh.write('kmer,ref_count,idx_count\n') + for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx): + fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n') + + print(f'{args.species},{args.strain},' + f'{len(ref_kmers)},{len(idx_kmers)},' + f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},' + f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}') + + +if __name__ == '__main__': + main() diff --git a/benchmark/verify_merge_count.py b/benchmark/verify_merge_count.py new file mode 100755 index 0000000..72518a1 --- /dev/null +++ b/benchmark/verify_merge_count.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +"""Verify the merged count index against all per-specimen reference sets. + +Streams `obikmer dump` once on the merged index, accumulates per-specimen +kmer+count pairs from each column, then compares each against its reference .npz. + +Output to stdout: one CSV row per specimen (same columns as verify_count.py) + species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch, + fn_pct,fp_pct,cm_pct +""" +import argparse +import subprocess +import sys +from pathlib import Path + +import numpy as np + + +# ── encoding ────────────────────────────────────────────────────────────────── + +_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, + 'a': 0, 'c': 1, 'g': 2, 't': 3} + +_DECODE = ['A', 'C', 'G', 'T'] + + +def encode_kmer(s: str) -> int: + kmer = 0 + for c in s: + kmer = (kmer << 2) | _ENCODE[c] + return kmer + + +def decode_kmer(val: int, k: int) -> str: + bases = [] + for _ in range(k): + bases.append(_DECODE[val & 3]) + val >>= 2 + return ''.join(reversed(bases)) + + +# ── single-pass dump ────────────────────────────────────────────────────────── + +def stream_merged_dump(obikmer_bin: str, index_dir: str, + ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]: + """Stream the merged dump once. + + Returns: + specimen_names : column labels in dump order + per_specimen : mapping label → (kmer_ints, counts) for entries > 0 + """ + cmd = [obikmer_bin, 'dump', index_dir] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, + text=True) + + header_line = proc.stdout.readline().rstrip('\n') + cols = header_line.split(',') + specimen_names = cols[1:] + per_specimen: dict[str, tuple[list[int], list[int]]] = { + name: ([], []) for name in specimen_names} + + for line in proc.stdout: + parts = line.rstrip('\n').split(',') + kmer_int = encode_kmer(parts[0]) + for i, name in enumerate(specimen_names): + count = int(parts[i + 1]) + if count > 0: + per_specimen[name][0].append(kmer_int) + per_specimen[name][1].append(count) + + proc.wait() + if proc.returncode != 0: + print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr) + sys.exit(1) + + return specimen_names, per_specimen + + +# ── per-specimen comparison ─────────────────────────────────────────────────── + +def compare_specimen(name: str, + kmer_list: list[int], + count_list: list[int], + ref_dir: Path, + k: int, + save_fn: Path | None, + save_fp: Path | None, + save_cm: Path | None, + ) -> str: + ref_path = ref_dir / f'{name}.npz' + if not ref_path.exists(): + print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr) + return '' + + species = name.split('--')[0] + strain = name[len(species) + 2:] + + npz = np.load(ref_path) + ref_kmers = npz['kmers'] # sorted uint64 + ref_counts = npz['counts'] # uint32 + + order = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable') + idx_kmers = np.array(kmer_list, dtype=np.uint64)[order] + idx_counts = np.array(count_list, dtype=np.uint32)[order] + + false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True) + false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True) + + # Count mismatches among shared kmers + pos_in_idx = np.searchsorted(idx_kmers, ref_kmers) + pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1) + shared_mask = idx_kmers[pos_in_idx] == ref_kmers + mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]] + cm_kmers = ref_kmers[shared_mask][mismatch_mask] + cm_ref = ref_counts[shared_mask][mismatch_mask] + cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask] + + n_shared = int(shared_mask.sum()) + fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 + fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 + cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0 + + print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} ' + f'fn={len(false_neg):,} ({fn_pct:.4f}%) ' + f'fp={len(false_pos):,} ({fp_pct:.4f}%) ' + f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)', + file=sys.stderr) + + if save_fn and len(false_neg): + fn_file = save_fn / f'{name}_fn.txt' + fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n') + + if save_fp and len(false_pos): + fp_file = save_fp / f'{name}_fp.txt' + fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n') + + if save_cm and len(cm_kmers): + cm_file = save_cm / f'{name}_cm.csv' + lines = ['kmer,ref_count,idx_count'] + for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx): + lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}') + cm_file.write_text('\n'.join(lines) + '\n') + + return (f'{species},{strain},' + f'{len(ref_kmers)},{len(idx_kmers)},' + f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},' + f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}') + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('index', metavar='INDEX_DIR', nargs='?', + help='Merged count index directory') + ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?', + help='Directory containing per-specimen .npz reference files') + ap.add_argument('--obikmer', default='obikmer') + ap.add_argument('--header', action='store_true', + help='Print CSV header and exit') + ap.add_argument('--save-fn', metavar='DIR', + help='Directory for false-negative kmer lists') + ap.add_argument('--save-fp', metavar='DIR', + help='Directory for false-positive kmer lists') + ap.add_argument('--save-cm', metavar='DIR', + help='Directory for count-mismatch CSV files') + args = ap.parse_args() + + if args.header: + print('species,strain,ref_kmers,idx_kmers,' + 'false_neg,false_pos,count_mismatch,' + 'fn_pct,fp_pct,cm_pct') + return + + ref_dir = Path(args.ref_dir) + save_fn = Path(args.save_fn) if args.save_fn else None + save_fp = Path(args.save_fp) if args.save_fp else None + save_cm = Path(args.save_cm) if args.save_cm else None + for d in (save_fn, save_fp, save_cm): + if d: d.mkdir(parents=True, exist_ok=True) + + out1 = subprocess.check_output( + [args.obikmer, 'dump', '--head', '1', args.index], + stderr=subprocess.DEVNULL, text=True) + k = len(out1.splitlines()[1].split(',')[0]) + + print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr) + specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index) + print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr) + + for name in specimen_names: + kmers, counts = per_specimen[name] + row = compare_specimen(name, kmers, counts, ref_dir, k, + save_fn, save_fp, save_cm) + if row: + print(row) + + +if __name__ == '__main__': + main() diff --git a/benchmark/verify_merge_count.sh b/benchmark/verify_merge_count.sh new file mode 100755 index 0000000..ebf4c36 --- /dev/null +++ b/benchmark/verify_merge_count.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +INDEX="${SCRIPT_DIR}/global_index_count" +REF_DIR="${SCRIPT_DIR}/reference_index" +STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py" + +mkdir -p "${STATS_DIR}" + +CURRENT="${STATS_DIR}/current.csv" + +"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}" + +"${PYTHON}" "${VERIFY_PY}" \ + --obikmer "${BINARY}" \ + "${INDEX}" "${REF_DIR}" \ + >>"${CURRENT}" + +run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')") +ARCHIVE="${STATS_DIR}/count_${run_n}.csv" +cp "${CURRENT}" "${ARCHIVE}" + +echo "Done. Results → ${ARCHIVE}" diff --git a/benchmark/verify_merge_presence.py b/benchmark/verify_merge_presence.py new file mode 100755 index 0000000..66fc12c --- /dev/null +++ b/benchmark/verify_merge_presence.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +"""Verify the merged presence index against all per-specimen reference sets. + +Streams `obikmer dump` once on the merged index, accumulates per-specimen +kmer sets from each column, then compares each against its reference .npz. + +Output to stdout: one CSV row per specimen (same columns as verify_presence.py) + species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct +""" +import argparse +import subprocess +import sys +from pathlib import Path + +import numpy as np + + +# ── encoding ────────────────────────────────────────────────────────────────── + +_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, + 'a': 0, 'c': 1, 'g': 2, 't': 3} + +_DECODE = ['A', 'C', 'G', 'T'] + + +def encode_kmer(s: str) -> int: + kmer = 0 + for c in s: + kmer = (kmer << 2) | _ENCODE[c] + return kmer + + +def decode_kmer(val: int, k: int) -> str: + bases = [] + for _ in range(k): + bases.append(_DECODE[val & 3]) + val >>= 2 + return ''.join(reversed(bases)) + + +# ── single-pass dump ────────────────────────────────────────────────────────── + +def stream_merged_dump(obikmer_bin: str, index_dir: str, + ) -> tuple[list[str], dict[str, list[int]]]: + """Stream the merged dump once. + + Returns: + specimen_names : column labels in dump order (excluding 'kmer') + per_specimen : mapping label → list of kmer ints where presence > 0 + """ + cmd = [obikmer_bin, 'dump', index_dir] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, + text=True) + + header_line = proc.stdout.readline().rstrip('\n') + cols = header_line.split(',') + specimen_names = cols[1:] # first col is 'kmer' + per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names} + + for line in proc.stdout: + parts = line.rstrip('\n').split(',') + kmer_int = encode_kmer(parts[0]) + for i, name in enumerate(specimen_names): + if int(parts[i + 1]) > 0: + per_specimen[name].append(kmer_int) + + proc.wait() + if proc.returncode != 0: + print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr) + sys.exit(1) + + return specimen_names, per_specimen + + +# ── per-specimen comparison ─────────────────────────────────────────────────── + +def compare_specimen(name: str, + kmer_list: list[int], + ref_dir: Path, + k: int, + save_fn: Path | None, + save_fp: Path | None, + ) -> str: + """Compare one specimen column against its reference .npz. + + Returns a CSV row string. + """ + ref_path = ref_dir / f'{name}.npz' + if not ref_path.exists(): + print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr) + return '' + + species = name.split('--')[0] + strain = name[len(species) + 2:] + + ref_kmers = np.load(ref_path)['kmers'] # sorted uint64 + idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64) + + false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True) + false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True) + + fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 + fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 + + print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} ' + f'fn={len(false_neg):,} ({fn_pct:.4f}%) ' + f'fp={len(false_pos):,} ({fp_pct:.4f}%)', + file=sys.stderr) + + if save_fn and len(false_neg): + fn_file = save_fn / f'{name}_fn.txt' + fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n') + + if save_fp and len(false_pos): + fp_file = save_fp / f'{name}_fp.txt' + fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n') + + return (f'{species},{strain},' + f'{len(ref_kmers)},{len(idx_kmers)},' + f'{len(false_neg)},{len(false_pos)},' + f'{fn_pct:.4f},{fp_pct:.4f}') + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('index', metavar='INDEX_DIR', nargs='?', + help='Merged presence index directory') + ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?', + help='Directory containing per-specimen .npz reference files') + ap.add_argument('--obikmer', default='obikmer') + ap.add_argument('--header', action='store_true', + help='Print CSV header and exit') + ap.add_argument('--save-fn', metavar='DIR', + help='Directory to save false-negative kmer lists') + ap.add_argument('--save-fp', metavar='DIR', + help='Directory to save false-positive kmer lists') + args = ap.parse_args() + + if args.header: + print('species,strain,ref_kmers,idx_kmers,' + 'false_neg,false_pos,fn_pct,fp_pct') + return + + ref_dir = Path(args.ref_dir) + save_fn = Path(args.save_fn) if args.save_fn else None + save_fp = Path(args.save_fp) if args.save_fp else None + if save_fn: save_fn.mkdir(parents=True, exist_ok=True) + if save_fp: save_fp.mkdir(parents=True, exist_ok=True) + + # Detect k + out1 = subprocess.check_output( + [args.obikmer, 'dump', '--head', '1', args.index], + stderr=subprocess.DEVNULL, text=True) + k = len(out1.splitlines()[1].split(',')[0]) + + print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr) + specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index) + print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr) + + for name in specimen_names: + row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp) + if row: + print(row) + + +if __name__ == '__main__': + main() diff --git a/benchmark/verify_merge_presence.sh b/benchmark/verify_merge_presence.sh new file mode 100755 index 0000000..bea5ddf --- /dev/null +++ b/benchmark/verify_merge_presence.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +INDEX="${SCRIPT_DIR}/global_index_presence" +REF_DIR="${SCRIPT_DIR}/reference_index" +STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py" + +mkdir -p "${STATS_DIR}" + +CURRENT="${STATS_DIR}/current.csv" + +"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}" + +"${PYTHON}" "${VERIFY_PY}" \ + --obikmer "${BINARY}" \ + "${INDEX}" "${REF_DIR}" \ + >>"${CURRENT}" + +run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')") +ARCHIVE="${STATS_DIR}/presence_${run_n}.csv" +cp "${CURRENT}" "${ARCHIVE}" + +echo "Done. Results → ${ARCHIVE}" diff --git a/benchmark/verify_one_count.sh b/benchmark/verify_one_count.sh new file mode 100755 index 0000000..3dfb8d6 --- /dev/null +++ b/benchmark/verify_one_count.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Usage: verify_one_count.sh SPECIMEN +# SPECIMEN = "species--strain" (Make pattern stem) +# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header) +set -euo pipefail + +SPECIMEN="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +VERIFY_PY="${SCRIPT_DIR}/verify_count.py" + +species="${SPECIMEN%%--*}" +strain="${SPECIMEN#*--}" + +REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz" +INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}" +STATS_DIR="${SCRIPT_DIR}/stats/verify_count" +STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIMEN}] verifying count" + +"${PYTHON}" "${VERIFY_PY}" \ + --obikmer "${BINARY}" \ + --species "${species}" \ + --strain "${strain}" \ + "${REF_NPZ}" "${INDEX_DIR}" \ + >"${STATS_FILE}" diff --git a/benchmark/verify_one_presence.sh b/benchmark/verify_one_presence.sh new file mode 100755 index 0000000..252a2c3 --- /dev/null +++ b/benchmark/verify_one_presence.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# Usage: verify_one_presence.sh SPECIMEN +# SPECIMEN = "species--strain" (Make pattern stem) +# Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header) +set -euo pipefail + +SPECIMEN="$1" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BINARY="${SCRIPT_DIR}/../src/target/release/obikmer" +PYTHON="${SCRIPT_DIR}/../.venv/bin/python3" +VERIFY_PY="${SCRIPT_DIR}/verify_presence.py" + +species="${SPECIMEN%%--*}" +strain="${SPECIMEN#*--}" + +REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz" +INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}" +STATS_DIR="${SCRIPT_DIR}/stats/verify_presence" +STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats" + +mkdir -p "${STATS_DIR}" + +echo "[${SPECIMEN}] verifying presence" + +"${PYTHON}" "${VERIFY_PY}" \ + --obikmer "${BINARY}" \ + --species "${species}" \ + --strain "${strain}" \ + "${REF_NPZ}" "${INDEX_DIR}" \ + >"${STATS_FILE}" diff --git a/benchmark/verify_presence.py b/benchmark/verify_presence.py new file mode 100755 index 0000000..7041dd5 --- /dev/null +++ b/benchmark/verify_presence.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +"""Compare an obikmer index against a reference kmer set (presence/absence). + +Loads the reference .npz (sorted uint64 kmers built by build_reference.py), +streams the output of `obikmer dump`, encodes each kmer string to uint64, +then reports false negatives and false positives using numpy set operations. + +Output to stdout: one CSV row + species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct +""" +import argparse +import subprocess +import sys + +import numpy as np + + +# ── encoding ────────────────────────────────────────────────────────────────── + +_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, + 'a': 0, 'c': 1, 'g': 2, 't': 3} + +_DECODE = ['A', 'C', 'G', 'T'] + + +def encode_kmer(s: str) -> int: + kmer = 0 + for c in s: + kmer = (kmer << 2) | _ENCODE[c] + return kmer + + +def decode_kmer(val: int, k: int) -> str: + bases = [] + for _ in range(k): + bases.append(_DECODE[val & 3]) + val >>= 2 + return ''.join(reversed(bases)) + + +# ── dump parsing ────────────────────────────────────────────────────────────── + +def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray: + """Stream `obikmer dump` and return a sorted uint64 array of kmer integers.""" + cmd = [obikmer_bin, 'dump', index_dir] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, + text=True) + kmers = [] + header = True + for line in proc.stdout: + if header: + header = False + continue + kmer_str = line.split(',', 1)[0] + kmers.append(encode_kmer(kmer_str)) + proc.wait() + if proc.returncode != 0: + print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr) + sys.exit(1) + arr = np.array(kmers, dtype=np.uint64) + arr.sort() + return arr + + +# ── comparison ──────────────────────────────────────────────────────────────── + +def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Return (false_negatives, false_positives) as uint64 arrays.""" + false_neg = np.setdiff1d(ref, idx, assume_unique=True) + false_pos = np.setdiff1d(idx, ref, assume_unique=True) + return false_neg, false_pos + + +# ── main ───────────────────────────────────────────────────────────────────── + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file') + ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory') + ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary') + ap.add_argument('--species', default='', help='Species label for CSV row') + ap.add_argument('--strain', default='', help='Strain label for CSV row') + ap.add_argument('--header', action='store_true', help='Print CSV header and exit') + ap.add_argument('--save-fp', metavar='FILE', + help='Save false-positive kmer strings to FILE') + ap.add_argument('--save-fn', metavar='FILE', + help='Save false-negative kmer strings to FILE') + args = ap.parse_args() + + if args.header: + print('species,strain,ref_kmers,idx_kmers,' + 'false_neg,false_pos,fn_pct,fp_pct') + return + + # Detect k from the index (one cheap call before the full dump). + cmd1 = [args.obikmer, 'dump', '--head', '1', args.index] + out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True) + k = len(out1.splitlines()[1].split(',')[0]) + + # Load reference + print(f'Loading reference: {args.reference}', file=sys.stderr) + npz = np.load(args.reference) + ref_kmers = npz['kmers'] # already sorted uint64 + + # Load index + print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr) + idx_kmers = load_index_kmers(args.obikmer, args.index) + + print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr) + + false_neg, false_pos = compare(ref_kmers, idx_kmers) + + fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 + fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 + + print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr) + print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr) + + if args.save_fn and len(false_neg): + with open(args.save_fn, 'w') as fh: + for v in false_neg: + fh.write(decode_kmer(int(v), k) + '\n') + print(f'False negatives saved → {args.save_fn}', file=sys.stderr) + + if args.save_fp and len(false_pos): + with open(args.save_fp, 'w') as fh: + for v in false_pos: + fh.write(decode_kmer(int(v), k) + '\n') + print(f'False positives saved → {args.save_fp}', file=sys.stderr) + + print(f'{args.species},{args.strain},' + f'{len(ref_kmers)},{len(idx_kmers)},' + f'{len(false_neg)},{len(false_pos)},' + f'{fn_pct:.4f},{fp_pct:.4f}') + + +if __name__ == '__main__': + main() diff --git a/docmd/implementation/filtering.md b/docmd/implementation/filtering.md index 4dfab31..fe56bc9 100644 --- a/docmd/implementation/filtering.md +++ b/docmd/implementation/filtering.md @@ -29,16 +29,17 @@ Multiple values separated by `|` are always OR-ed within the predicate. ### Path matching (`~` and `!~`) -Metadata values can represent hierarchical taxonomic paths such as +Metadata values can represent hierarchical concept paths such as `/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`. -- **Absolute pattern** (starts with `/`): the value must start with the pattern - at a segment boundary. - `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and +**Both the stored metadata value and the pattern must start with `/`.** +A pattern that does not start with `/` is rejected at parse time with an error. + +The value matches the pattern if it equals it exactly or starts with the pattern +followed by `/` (segment-boundary prefix): + +- `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and `/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`. -- **Bare segment** (no leading `/`): the value must contain the pattern as an - exact path component anywhere. - `taxon~Betula` matches any path that has `Betula` as one of its segments. ### Missing metadata key → NA diff --git a/mkdocs.yml b/mkdocs.yml index c27d1a9..7973e78 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -53,6 +53,7 @@ nav: - Merge parallelism & memory: implementation/merge_parallelism.md - Kmer filtering: implementation/filtering.md - Select command: implementation/select.md + - obitaxonomy crate: implementation/obitaxonomy.md - Architecture: - Sequences: architecture/sequences/invariant.md - Kmer index: architecture/index_architecture.md diff --git a/src/Cargo.lock b/src/Cargo.lock index 2983231..a48a7fd 100644 --- a/src/Cargo.lock +++ b/src/Cargo.lock @@ -1853,6 +1853,10 @@ dependencies = [ "tracing", ] +[[package]] +name = "obitaxonomy" +version = "0.1.0" + [[package]] name = "object" version = "0.37.3" diff --git a/src/Cargo.toml b/src/Cargo.toml index 46a4f87..141df02 100644 --- a/src/Cargo.toml +++ b/src/Cargo.toml @@ -1,5 +1,5 @@ [workspace] resolver = "3" -members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"] +members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"] [profile.release] debug = 1 diff --git a/src/obicompactvec/src/bitvec.rs b/src/obicompactvec/src/bitvec.rs index 145bd63..ee7d6f7 100644 --- a/src/obicompactvec/src/bitvec.rs +++ b/src/obicompactvec/src/bitvec.rs @@ -88,9 +88,9 @@ impl<'a> IntoIterator for &'a PersistentBitVec { // ── BitIter ─────────────────────────────────────────────────────────────────── pub struct BitIter<'a> { - pub(crate) words: &'a [u64], - pub(crate) slot: usize, - pub(crate) n: usize, + words: &'a [u64], + slot: usize, + n: usize, } impl ExactSizeIterator for BitIter<'_> {} @@ -132,7 +132,7 @@ impl PersistentBitVecBuilder { Ok(Self { mmap, n, path: path.to_path_buf() }) } - pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result { + pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result { let file_size = HEADER_SIZE + n_bytes_for_words(n); let file = OpenOptions::new() .read(true).write(true).create(true).truncate(true) diff --git a/src/obicompactvec/src/lib.rs b/src/obicompactvec/src/lib.rs index 25a8032..9041ab7 100644 --- a/src/obicompactvec/src/lib.rs +++ b/src/obicompactvec/src/lib.rs @@ -18,11 +18,11 @@ pub use builder::PersistentCompactIntVecBuilder; pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask}; pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix}; pub use layer_meta::LayerMeta; -pub use reader::PersistentCompactIntVec; -pub use tempbitvec::TempBitVec; -pub use tempintvec::TempCompactIntVec; +pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter}; +pub use tempbitvec::{TempBitVec, TempBitVecBuilder}; +pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder}; pub use traits::{BitPartials, ColumnWeights, CountPartials}; -pub use views::{BitSliceView, IntSliceView}; +pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter}; #[cfg(test)] #[path = "tests/mod.rs"] diff --git a/src/obicompactvec/src/tempbitvec.rs b/src/obicompactvec/src/tempbitvec.rs index 8bbec16..df1d436 100644 --- a/src/obicompactvec/src/tempbitvec.rs +++ b/src/obicompactvec/src/tempbitvec.rs @@ -43,27 +43,27 @@ impl TempBitVec { // ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ──────────────── -pub(crate) struct TempBitVecBuilder { +pub struct TempBitVecBuilder { builder: PersistentBitVecBuilder, temp: TempDir, } impl TempBitVecBuilder { - pub(crate) fn new(n: usize) -> io::Result { + pub fn new(n: usize) -> io::Result { let temp = TempDir::new()?; let path = temp.path().join("data.pbiv"); let builder = PersistentBitVecBuilder::new(n, &path)?; Ok(Self { builder, temp }) } - pub(crate) fn new_ones(n: usize) -> io::Result { + pub fn new_ones(n: usize) -> io::Result { let temp = TempDir::new()?; let path = temp.path().join("data.pbiv"); let builder = PersistentBitVecBuilder::new_ones(n, &path)?; Ok(Self { builder, temp }) } - pub(crate) fn freeze(self) -> io::Result { + pub fn freeze(self) -> io::Result { let Self { builder, temp } = self; let vec = builder.finish()?; Ok(TempBitVec { vec, _temp: temp }) @@ -72,7 +72,8 @@ impl TempBitVecBuilder { pub fn set(&mut self, slot: usize, value: bool) { self.builder.set(slot, value); } - pub(crate) fn view(&self) -> BitSliceView<'_> { + + pub fn view(&self) -> BitSliceView<'_> { self.builder.view() } @@ -80,19 +81,19 @@ impl TempBitVecBuilder { self.builder.or(other); } - pub(crate) fn and(&mut self, other: BitSliceView<'_>) { + pub fn and(&mut self, other: BitSliceView<'_>) { self.builder.and(other); } - pub(crate) fn xor(&mut self, other: BitSliceView<'_>) { + pub fn xor(&mut self, other: BitSliceView<'_>) { self.builder.xor(other); } - pub(crate) fn not(&mut self) { + pub fn not(&mut self) { self.builder.not(); } - pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) { + pub fn copy_from(&mut self, src: BitSliceView<'_>) { self.builder.copy_from(src); } @@ -100,11 +101,11 @@ impl TempBitVecBuilder { self.builder.or_where(col, pred); } - pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { self.builder.and_where(col, pred); } - pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { self.builder.xor_where(col, pred); } } diff --git a/src/obicompactvec/src/tempintvec.rs b/src/obicompactvec/src/tempintvec.rs index e5ff848..b0b3492 100644 --- a/src/obicompactvec/src/tempintvec.rs +++ b/src/obicompactvec/src/tempintvec.rs @@ -32,60 +32,58 @@ impl TempCompactIntVec { // ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ── -pub(crate) struct TempCompactIntVecBuilder { +pub struct TempCompactIntVecBuilder { builder: PersistentCompactIntVecBuilder, temp: TempDir, } impl TempCompactIntVecBuilder { - pub(crate) fn new(n: usize) -> io::Result { + pub fn new(n: usize) -> io::Result { let temp = TempDir::new()?; let path = temp.path().join("data.pciv"); let builder = PersistentCompactIntVecBuilder::new(n, &path)?; Ok(Self { builder, temp }) } - pub(crate) fn freeze(self) -> io::Result { + pub fn freeze(self) -> io::Result { let Self { builder, temp } = self; let vec = builder.finish()?; Ok(TempCompactIntVec { vec, _temp: temp }) } - // ── Delegation methods ──────────────────────────────────────────────────── + pub fn n(&self) -> usize { self.builder.len() } - pub(crate) fn n(&self) -> usize { self.builder.len() } + pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); } + pub fn get(&self, slot: usize) -> u32 { self.builder.get(slot) } - pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); } - pub(crate) fn get(&self, slot: usize) -> u32 { self.builder.get(slot) } + pub fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() } + pub fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() } - pub(crate) fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() } - pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() } - - pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) { + pub fn inc_present(&mut self, col: BitSliceView<'_>) { self.builder.inc_present(col); } - pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) { + pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) { self.builder.inc_present_fast(col); } - pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { self.builder.inc_predicate(col, pred); } - pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { + pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) { self.builder.inc_predicate_fast(col, pred); } - pub(crate) fn add(&mut self, other: IntSliceView<'_>) { + pub fn add(&mut self, other: IntSliceView<'_>) { self.builder.add(other); } - pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) { + pub fn mask_with(&mut self, mask: BitSliceView<'_>) { self.builder.mask_with(mask); } - pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); } - pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); } - pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); } + pub fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); } + pub fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); } + pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); } } diff --git a/src/obidebruinj/src/debruijn.rs b/src/obidebruinj/src/debruijn.rs index 8d300f2..f59f03a 100644 --- a/src/obidebruinj/src/debruijn.rs +++ b/src/obidebruinj/src/debruijn.rs @@ -3,6 +3,7 @@ use crossbeam_channel; use hashbrown::HashMap; use obikseq::k; use obikseq::{CanonicalKmer, Sequence, Unitig}; +#[cfg(not(any(test, feature = "test-utils")))] use rayon::iter::{IntoParallelRefIterator, ParallelIterator}; use std::cell::RefCell; use std::fmt; diff --git a/src/obikindex/src/merge.rs b/src/obikindex/src/merge.rs index c637c9b..cbfdaba 100644 --- a/src/obikindex/src/merge.rs +++ b/src/obikindex/src/merge.rs @@ -11,7 +11,7 @@ use obilayeredmap::IndexMode; use crate::error::{OKIError, OKIResult}; use crate::index::KmerIndex; use crate::meta::{GenomeInfo, IndexMeta}; -use crate::state::IndexState; +use crate::state::{IndexState, SENTINEL_INDEXED}; pub use obikpartitionner::MergeMode; @@ -263,6 +263,8 @@ impl KmerIndex { rep.push(t.stop()); } + fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?; + KmerIndex::open(output) } } diff --git a/src/obikmer/src/cmd/predicate.rs b/src/obikmer/src/cmd/predicate.rs index 04678f0..b1183d3 100644 --- a/src/obikmer/src/cmd/predicate.rs +++ b/src/obikmer/src/cmd/predicate.rs @@ -49,6 +49,11 @@ impl MetaPred { if values.iter().any(|v| v.is_empty()) { return Err(format!("empty value in predicate: {s}")); } + if matches!(op, PredOp::Matches | PredOp::NotMatches) { + if let Some(v) = values.iter().find(|v| !v.starts_with('/')) { + return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}")); + } + } Ok(Self { key, op, values }) } @@ -72,16 +77,12 @@ impl MetaPred { /// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy. /// -/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary. -/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere. +/// Both `value` and `pattern` must start with `/`. +/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`. fn path_matches(value: &str, pattern: &str) -> bool { - if pattern.starts_with('/') { - value == pattern - || (value.starts_with(pattern) - && value[pattern.len()..].starts_with('/')) - } else { - value.split('/').any(|seg| seg == pattern) - } + value == pattern + || (value.starts_with(pattern) + && value[pattern.len()..].starts_with('/')) } // ── Three-value group evaluation ────────────────────────────────────────────── diff --git a/test.sk.fasta b/test.sk.fasta deleted file mode 100644 index ff8e303..0000000 --- a/test.sk.fasta +++ /dev/null @@ -1,28 +0,0 @@ ->F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"} -GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA ->C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"} -AAAAAAATTAAGTCCCATATTGAAACATTAT ->9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"} -AAAAAATTAAGTCCCATATTGAAACATTATC ->74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"} -AAAAATTAAGTCCCATATTGAAACATTATCA ->45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"} -AAAATTAAGTCCCATATTGAAACATTATCAC ->5F44864B90170AF4 {"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"} -AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT ->8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"} -AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG ->C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"} -AATGTGAGTTGTTAATATTACATAATTGGGTT ->933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"} -TGTGAGTTGTTAATATTACATAATTGGGTTT ->3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"} -GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT ->1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"} -TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC ->8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"} -AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT ->247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"} -CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT ->11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"} -GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG