Push mtzqmmrlmzzx #34
+10
@@ -9,3 +9,13 @@ data-stress
|
|||||||
./**/*.json
|
./**/*.json
|
||||||
*.bin
|
*.bin
|
||||||
Betula_exilis--IGA-24-33
|
Betula_exilis--IGA-24-33
|
||||||
|
benchmark/genomes
|
||||||
|
benchmark/simulated_data
|
||||||
|
benchmark/specimen_index_presence
|
||||||
|
benchmark/specimen_index_count
|
||||||
|
benchmark/global_index_presence
|
||||||
|
benchmark/global_index_count
|
||||||
|
benchmark/stats
|
||||||
|
benchmark/reference_index
|
||||||
|
benchmark/specific_index_count
|
||||||
|
benchmark/specific_index_presence
|
||||||
|
|||||||
@@ -0,0 +1,2 @@
|
|||||||
|
/cache
|
||||||
|
/project.local.yml
|
||||||
@@ -0,0 +1,133 @@
|
|||||||
|
# the name by which the project can be referenced within Serena
|
||||||
|
project_name: "obikmer"
|
||||||
|
|
||||||
|
|
||||||
|
# list of languages for which language servers are started; choose from:
|
||||||
|
# al angular ansible bash clojure
|
||||||
|
# cpp cpp_ccls crystal csharp csharp_omnisharp
|
||||||
|
# dart elixir elm erlang fortran
|
||||||
|
# fsharp go groovy haskell haxe
|
||||||
|
# hlsl html java json julia
|
||||||
|
# kotlin lean4 lua luau markdown
|
||||||
|
# matlab msl nix ocaml pascal
|
||||||
|
# perl php php_phpactor powershell python
|
||||||
|
# python_jedi python_ty r rego ruby
|
||||||
|
# ruby_solargraph rust scala scss solidity
|
||||||
|
# svelte swift systemverilog terraform toml
|
||||||
|
# typescript typescript_vts vue yaml zig
|
||||||
|
# (This list may be outdated. For the current list, see values of Language enum here:
|
||||||
|
# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
|
||||||
|
# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
|
||||||
|
# Note:
|
||||||
|
# - For C, use cpp
|
||||||
|
# - For JavaScript, use typescript
|
||||||
|
# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
|
||||||
|
# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
|
||||||
|
# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
|
||||||
|
# - For Free Pascal/Lazarus, use pascal
|
||||||
|
# Special requirements:
|
||||||
|
# Some languages require additional setup/installations.
|
||||||
|
# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
|
||||||
|
# When using multiple languages, the first language server that supports a given file will be used for that file.
|
||||||
|
# The first language is the default language and the respective language server will be used as a fallback.
|
||||||
|
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
|
||||||
|
languages:
|
||||||
|
- rust
|
||||||
|
|
||||||
|
# the encoding used by text files in the project
|
||||||
|
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
|
||||||
|
encoding: "utf-8"
|
||||||
|
|
||||||
|
# line ending convention to use when writing source files.
|
||||||
|
# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
|
||||||
|
# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
|
||||||
|
line_ending:
|
||||||
|
|
||||||
|
# The language backend to use for this project.
|
||||||
|
# If not set, the global setting from serena_config.yml is used.
|
||||||
|
# Valid values: LSP, JetBrains
|
||||||
|
# Note: the backend is fixed at startup. If a project with a different backend
|
||||||
|
# is activated post-init, an error will be returned.
|
||||||
|
language_backend:
|
||||||
|
|
||||||
|
# whether to use project's .gitignore files to ignore files
|
||||||
|
ignore_all_files_in_gitignore: true
|
||||||
|
|
||||||
|
# advanced configuration option allowing to configure language server-specific options.
|
||||||
|
# Maps the language key to the options.
|
||||||
|
# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
|
||||||
|
# No documentation on options means no options are available.
|
||||||
|
ls_specific_settings: {}
|
||||||
|
|
||||||
|
# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
|
||||||
|
# Paths can be absolute or relative to the project root.
|
||||||
|
# Each folder is registered as an LSP workspace folder, enabling language servers to discover
|
||||||
|
# symbols and references across package boundaries.
|
||||||
|
# Currently supported for: TypeScript.
|
||||||
|
# Example:
|
||||||
|
# additional_workspace_folders:
|
||||||
|
# - ../sibling-package
|
||||||
|
# - ../shared-lib
|
||||||
|
additional_workspace_folders: []
|
||||||
|
|
||||||
|
# list of additional paths to ignore in this project.
|
||||||
|
# Same syntax as gitignore, so you can use * and **.
|
||||||
|
# Note: global ignored_paths from serena_config.yml are also applied additively.
|
||||||
|
ignored_paths: []
|
||||||
|
|
||||||
|
# whether the project is in read-only mode
|
||||||
|
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
|
||||||
|
# Added on 2025-04-18
|
||||||
|
read_only: false
|
||||||
|
|
||||||
|
# list of tool names to exclude.
|
||||||
|
# This extends the existing exclusions (e.g. from the global configuration)
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
excluded_tools: []
|
||||||
|
|
||||||
|
# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
|
||||||
|
# This extends the existing inclusions (e.g. from the global configuration).
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
included_optional_tools: []
|
||||||
|
|
||||||
|
# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
|
||||||
|
# This cannot be combined with non-empty excluded_tools or included_optional_tools.
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
fixed_tools: []
|
||||||
|
|
||||||
|
# list of mode names that are to be activated by default, overriding the setting in the global configuration.
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
|
||||||
|
# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
|
||||||
|
# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
|
||||||
|
# for this project.
|
||||||
|
# This setting can, in turn, be overridden by CLI parameters (--mode).
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
default_modes:
|
||||||
|
|
||||||
|
# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
added_modes:
|
||||||
|
|
||||||
|
# initial prompt for the project. It will always be given to the LLM upon activating the project
|
||||||
|
# (contrary to the memories, which are loaded on demand).
|
||||||
|
initial_prompt: ""
|
||||||
|
|
||||||
|
# time budget (seconds) per tool call for the retrieval of additional symbol information
|
||||||
|
# such as docstrings or parameter information.
|
||||||
|
# This overrides the corresponding setting in the global configuration; see the documentation there.
|
||||||
|
# If null or missing, use the setting from the global configuration.
|
||||||
|
symbol_info_budget:
|
||||||
|
|
||||||
|
# list of regex patterns which, when matched, mark a memory entry as read‑only.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
read_only_memory_patterns: []
|
||||||
|
|
||||||
|
# list of regex patterns for memories to completely ignore.
|
||||||
|
# Matching memories will not appear in list_memories or activate_project output
|
||||||
|
# and cannot be accessed via read_memory or write_memory.
|
||||||
|
# To access ignored memory files, use the read_file tool on the raw file path.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
# Example: ["_archive/.*", "_episodes/.*"]
|
||||||
|
ignored_memory_patterns: []
|
||||||
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
|
|||||||
---
|
---
|
||||||
|
|
||||||
Je continue à poser mes questions et à guider la discussion.
|
Je continue à poser mes questions et à guider la discussion.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## MCP Tools
|
||||||
|
|
||||||
|
**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
|
||||||
|
|
||||||
|
### Hiérarchie des outils pour ce projet Rust
|
||||||
|
|
||||||
|
**Navigation et édition de code → serena en priorité**
|
||||||
|
- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
|
||||||
|
- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
|
||||||
|
- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
|
||||||
|
- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
|
||||||
|
- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
|
||||||
|
- Ne pas utiliser `cclsp` quand serena couvre le besoin
|
||||||
|
|
||||||
|
**Analyse architecturale → jcodemunch**
|
||||||
|
- Hotspots, couplage, dead code, dépendances entre modules
|
||||||
|
- Utiliser avant de refactorer une zone critique
|
||||||
|
|
||||||
|
**Raisonnement complexe → sequential-thinking**
|
||||||
|
- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
|
||||||
|
|
||||||
|
**Documentation de crates → context7**
|
||||||
|
- Toujours consulter avant d'utiliser une API de bibliothèque externe
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
|
|||||||
mkdocs mkdocs-material \
|
mkdocs mkdocs-material \
|
||||||
mkdocs-mermaid2-plugin \
|
mkdocs-mermaid2-plugin \
|
||||||
mkdocs-bibtex
|
mkdocs-bibtex
|
||||||
|
$(PIP) install --quiet --upgrade InSilicoSeq
|
||||||
|
|
||||||
# ── obikmer binary ───────────────────────────────────────────────────────────
|
# ── obikmer binary ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,144 @@
|
|||||||
|
# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
|
||||||
|
BINARY := ../src/target/release/obikmer
|
||||||
|
VENV_PY := ../.venv/bin/python3
|
||||||
|
|
||||||
|
GENOMES := $(wildcard genomes/*.fna.gz)
|
||||||
|
|
||||||
|
# SPECIMENS, SPECIES, and the full dependency graph are generated by
|
||||||
|
# make_deps.py from the genome FASTA headers — like .d files in C.
|
||||||
|
# Make rebuilds deps.mk whenever genomes/ changes and restarts.
|
||||||
|
-include deps.mk
|
||||||
|
|
||||||
|
REF_NPZS := $(SPECIMENS:%=reference_index/%.npz)
|
||||||
|
PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done)
|
||||||
|
PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats)
|
||||||
|
COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done)
|
||||||
|
COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats)
|
||||||
|
VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
|
||||||
|
VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats)
|
||||||
|
SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done)
|
||||||
|
SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
|
||||||
|
SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done)
|
||||||
|
SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats)
|
||||||
|
SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
|
||||||
|
|
||||||
|
.NOTPARALLEL:
|
||||||
|
|
||||||
|
.PHONY: all simulate reference \
|
||||||
|
index_presence index_count \
|
||||||
|
aggregate_index_presence aggregate_index_count \
|
||||||
|
merge_presence merge_count \
|
||||||
|
verify_presence verify_count \
|
||||||
|
aggregate_verify_presence aggregate_verify_count \
|
||||||
|
verify_merge_presence verify_merge_count \
|
||||||
|
filter_presence filter_count \
|
||||||
|
aggregate_filter_presence aggregate_filter_count
|
||||||
|
|
||||||
|
verify_merge_presence: stats/verify_merge_presence/current.csv
|
||||||
|
verify_merge_count: stats/verify_merge_count/current.csv
|
||||||
|
|
||||||
|
all: aggregate_verify_presence aggregate_verify_count \
|
||||||
|
verify_merge_presence verify_merge_count \
|
||||||
|
aggregate_filter_presence aggregate_filter_count
|
||||||
|
|
||||||
|
# ── dependency file ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# deps.mk is written to a temporary file and renamed only on success: a plain
# "> $@" would leave a truncated deps.mk behind when make_deps.py fails, and
# that file would look up to date on the next run (empty SPECIMENS/SPECIES).
deps.mk: $(GENOMES)
	$(VENV_PY) make_deps.py $^ > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv $@.tmp $@
|
||||||
|
|
||||||
|
# ── simulation ────────────────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
|
||||||
|
|
||||||
|
$(SIMULATED_READS):
|
||||||
|
bash simulate_one.sh $< $(dir $@)
|
||||||
|
|
||||||
|
simulate: $(SIMULATED_READS)
|
||||||
|
|
||||||
|
# ── reference kmer sets ───────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (reads → npz) are in deps.mk.
|
||||||
|
|
||||||
|
reference_index/%.npz:
|
||||||
|
bash build_reference.sh $*
|
||||||
|
|
||||||
|
reference: $(REF_NPZS)
|
||||||
|
|
||||||
|
# ── per-specimen indexing ─────────────────────────────────────────────────────
|
||||||
|
# Prerequisites (reads → index.done + .stats) are in deps.mk.
|
||||||
|
|
||||||
|
specimen_index_presence/%/index.done \
|
||||||
|
stats/indexing_presence/%.stats &: $(BINARY)
|
||||||
|
bash index_one_presence.sh $*
|
||||||
|
|
||||||
|
specimen_index_count/%/index.done \
|
||||||
|
stats/indexing_count/%.stats &: $(BINARY)
|
||||||
|
bash index_one_count.sh $*
|
||||||
|
|
||||||
|
index_presence: $(PRESENCE_DONE)
|
||||||
|
index_count: $(COUNT_DONE)
|
||||||
|
|
||||||
|
# ── indexing stats aggregation ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
aggregate_index_presence: $(PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh indexing_presence
|
||||||
|
|
||||||
|
aggregate_index_count: $(COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh indexing_count
|
||||||
|
|
||||||
|
# ── global merge ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
|
||||||
|
bash merge_presence.sh
|
||||||
|
|
||||||
|
global_index_count/index.done: $(COUNT_DONE) $(BINARY)
|
||||||
|
bash merge_count.sh
|
||||||
|
|
||||||
|
merge_presence: global_index_presence/index.done
|
||||||
|
merge_count: global_index_count/index.done
|
||||||
|
|
||||||
|
# ── per-specimen verification ─────────────────────────────────────────────────
|
||||||
|
# Prerequisites (index.done + npz → .stats) are in deps.mk.
|
||||||
|
|
||||||
|
stats/verify_presence/%.stats:
|
||||||
|
bash verify_one_presence.sh $*
|
||||||
|
|
||||||
|
stats/verify_count/%.stats:
|
||||||
|
bash verify_one_count.sh $*
|
||||||
|
|
||||||
|
verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||||
|
verify_count: $(VERIFY_COUNT_STATS)
|
||||||
|
|
||||||
|
# ── verification stats aggregation ───────────────────────────────────────────
|
||||||
|
|
||||||
|
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh verify_presence
|
||||||
|
|
||||||
|
aggregate_verify_count: $(VERIFY_COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh verify_count
|
||||||
|
|
||||||
|
# ── species-specific indexes ──────────────────────────────────────────────────
|
||||||
|
# Prerequisites (global index → specific index) are in deps.mk.
|
||||||
|
|
||||||
|
specific_index_presence/%/index.done \
|
||||||
|
stats/specific_kmer_presence/%.stats &: $(BINARY)
|
||||||
|
bash filter_one_presence.sh $*
|
||||||
|
|
||||||
|
specific_index_count/%/index.done \
|
||||||
|
stats/specific_kmer_count/%.stats &: $(BINARY)
|
||||||
|
bash filter_one_count.sh $*
|
||||||
|
|
||||||
|
filter_presence: $(SPECIFIC_PRESENCE_DONE)
|
||||||
|
filter_count: $(SPECIFIC_COUNT_DONE)
|
||||||
|
|
||||||
|
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
|
||||||
|
bash aggregate_stats.sh specific_kmer_presence
|
||||||
|
|
||||||
|
aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
|
||||||
|
bash aggregate_stats.sh specific_kmer_count
|
||||||
|
|
||||||
|
# ── merged index verification ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
|
||||||
|
bash verify_merge_presence.sh
|
||||||
|
|
||||||
|
stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
|
||||||
|
bash verify_merge_count.sh
|
||||||
@@ -0,0 +1,132 @@
|
|||||||
|
# Benchmark pipeline
|
||||||
|
|
||||||
|
Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`.
|
||||||
|
|
||||||
|
```
|
||||||
|
gmake all # full pipeline
|
||||||
|
gmake simulate # simulation only
|
||||||
|
gmake reference # reference kmer sets only
|
||||||
|
```
|
||||||
|
|
||||||
|
## Pipeline overview
|
||||||
|
|
||||||
|
```mermaid
|
||||||
|
flowchart TD
|
||||||
|
GENOMES["genomes/*.fna.gz"]
|
||||||
|
BIN["obikmer binary"]
|
||||||
|
|
||||||
|
GENOMES --> simulate
|
||||||
|
simulate --> simdata[("simulated_data/")]
|
||||||
|
|
||||||
|
simdata --> reference
|
||||||
|
reference --> refnpz[("reference_index/*.npz")]
|
||||||
|
|
||||||
|
subgraph presence ["Presence track"]
|
||||||
|
simdata --> index_presence
|
||||||
|
BIN --> index_presence
|
||||||
|
index_presence --> pres_done[("specimen_index_presence/")]
|
||||||
|
index_presence --> pres_istats[("stats/indexing_presence/")]
|
||||||
|
pres_istats --> aggregate_index_presence
|
||||||
|
|
||||||
|
pres_done --> merge_presence
|
||||||
|
BIN --> merge_presence
|
||||||
|
merge_presence --> gpres[("global_index_presence/")]
|
||||||
|
|
||||||
|
refnpz --> verify_presence
|
||||||
|
pres_done --> verify_presence
|
||||||
|
verify_presence --> vpres_stats[("stats/verify_presence/")]
|
||||||
|
vpres_stats --> aggregate_verify_presence
|
||||||
|
|
||||||
|
gpres --> filter_presence
|
||||||
|
BIN --> filter_presence
|
||||||
|
filter_presence --> spec_pres[("specific_index_presence/")]
|
||||||
|
filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
|
||||||
|
spec_pres_stats --> aggregate_filter_presence
|
||||||
|
|
||||||
|
refnpz --> verify_merge_presence
|
||||||
|
gpres --> verify_merge_presence
|
||||||
|
verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
|
||||||
|
end
|
||||||
|
|
||||||
|
subgraph count ["Count track"]
|
||||||
|
simdata --> index_count
|
||||||
|
BIN --> index_count
|
||||||
|
index_count --> count_done[("specimen_index_count/")]
|
||||||
|
index_count --> count_istats[("stats/indexing_count/")]
|
||||||
|
count_istats --> aggregate_index_count
|
||||||
|
|
||||||
|
count_done --> merge_count
|
||||||
|
BIN --> merge_count
|
||||||
|
merge_count --> gcount[("global_index_count/")]
|
||||||
|
|
||||||
|
refnpz --> verify_count
|
||||||
|
count_done --> verify_count
|
||||||
|
verify_count --> vcount_stats[("stats/verify_count/")]
|
||||||
|
vcount_stats --> aggregate_verify_count
|
||||||
|
|
||||||
|
gcount --> filter_count
|
||||||
|
BIN --> filter_count
|
||||||
|
filter_count --> spec_count[("specific_index_count/")]
|
||||||
|
filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
|
||||||
|
spec_count_stats --> aggregate_filter_count
|
||||||
|
|
||||||
|
refnpz --> verify_merge_count
|
||||||
|
gcount --> verify_merge_count
|
||||||
|
verify_merge_count --> vmc[("stats/verify_merge_count/")]
|
||||||
|
end
|
||||||
|
|
||||||
|
aggregate_verify_presence --> all
|
||||||
|
aggregate_verify_count --> all
|
||||||
|
vmp --> all
|
||||||
|
vmc --> all
|
||||||
|
all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
|
||||||
|
all -. "$(MAKE) re-eval" .-> aggregate_filter_count
|
||||||
|
```
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
| Target | Script | Description |
|
||||||
|
|---|---|---|
|
||||||
|
| `simulate` | `simulate_one.sh` | Simulate sequencing reads from the reference genomes |
|
||||||
|
| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
|
||||||
|
| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
|
||||||
|
| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
|
||||||
|
| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
|
||||||
|
| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
|
||||||
|
| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
|
||||||
|
| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
|
||||||
|
| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
|
||||||
|
| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
|
||||||
|
| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
|
||||||
|
| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
|
||||||
|
| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
|
||||||
|
| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
|
||||||
|
| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
|
||||||
|
| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
|
||||||
|
| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
|
||||||
|
| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
|
||||||
|
|
||||||
|
## Directory layout
|
||||||
|
|
||||||
|
```
|
||||||
|
benchmark/
|
||||||
|
├── genomes/ # input reference genomes (.fna.gz)
|
||||||
|
├── simulated_data/ # generated by simulate
|
||||||
|
│ └── <species>/<specimen>/
|
||||||
|
├── reference_index/ # reference kmer sets (.npz)
|
||||||
|
├── specimen_index_presence/ # per-specimen presence indexes
|
||||||
|
├── specimen_index_count/ # per-specimen count indexes
|
||||||
|
├── global_index_presence/ # merged global presence index
|
||||||
|
├── global_index_count/ # merged global count index
|
||||||
|
├── specific_index_presence/ # species-specific presence indexes
|
||||||
|
├── specific_index_count/ # species-specific count indexes
|
||||||
|
└── stats/ # all benchmark statistics
|
||||||
|
├── indexing_presence/
|
||||||
|
├── indexing_count/
|
||||||
|
├── verify_presence/
|
||||||
|
├── verify_count/
|
||||||
|
├── specific_kmer_presence/
|
||||||
|
├── specific_kmer_count/
|
||||||
|
├── verify_merge_presence/
|
||||||
|
└── verify_merge_count/
|
||||||
|
```
|
||||||
Executable
+53
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
#   TYPE = indexing_presence | indexing_count
#        | verify_presence   | verify_count
#        | specific_kmer_presence | specific_kmer_count
#
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header) and
# writes them, each row prefixed with the run number, below a header line in
# stats/TYPE/run_NNN.csv.
# A new run CSV is created only if any .stats file is newer than the most
# recent run CSV (idempotent when nothing changed).
set -euo pipefail

# Fail with a usage message instead of bash's "unbound variable" error.
if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "$0") TYPE" >&2
    exit 2
fi

TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"

# CSV header for each supported stats type ("run" is the column added here).
case "${TYPE}" in
    indexing_presence|indexing_count)
        HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
        ;;
    verify_presence)
        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
        ;;
    verify_count)
        HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
        ;;
    specific_kmer_presence|specific_kmer_count)
        HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
        ;;
    *)
        echo "ERROR: unknown stats type '${TYPE}'" >&2
        exit 1
        ;;
esac

# Without this check a missing directory makes find fail inside a pipeline,
# and pipefail + errexit would abort the script without any message.
if [[ ! -d "${STATS_DIR}" ]]; then
    echo "ERROR: stats directory '${STATS_DIR}' not found" >&2
    exit 1
fi

# Find the existing run CSV with the highest run number (none: last_run=-1).
# The next run number is derived from it rather than from a file count, so
# deleting an old run can never cause an existing CSV to be overwritten.
last_run=-1
latest_csv=""
for csv in "${STATS_DIR}"/run_*.csv; do
    [[ -f "${csv}" ]] || continue          # unmatched glob stays literal
    num="${csv##*/run_}"
    num="${num%.csv}"
    [[ "${num}" =~ ^[0-9]+$ ]] || continue
    # 10# forces base 10 so that zero-padded numbers are not read as octal.
    if (( 10#${num} > last_run )); then
        last_run=$(( 10#${num} ))
        latest_csv="${csv}"
    fi
done

# Check if any .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]] && \
   [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
    echo "[${TYPE}] stats up to date (${latest_csv})"
    exit 0
fi

run_n=$(printf '%03d' "$(( last_run + 1 ))")
CSV="${STATS_DIR}/run_${run_n}.csv"

# Header first, then every .stats row prefixed with the run number.
# .stats files are sorted by name for reproducible row order.
{
    echo "${HEADER}"
    while IFS= read -r stats_file; do
        sed "s/^/${run_n},/" "${stats_file}"
    done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort)
} >"${CSV}"

echo "[${TYPE}] run ${run_n} → ${CSV}"
|
||||||
Executable
+137
@@ -0,0 +1,137 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Build a reference kmer index from paired-end FASTQ reads.
|
||||||
|
|
||||||
|
Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
|
||||||
|
counts their abundances, and saves a sorted numpy pair (kmers, counts).
|
||||||
|
|
||||||
|
Output .npz arrays
|
||||||
|
kmers : uint64, sorted ascending — canonical kmer integers
|
||||||
|
counts : uint32, same order — raw read abundances
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import gzip
|
||||||
|
import sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
# Byte-wise reverse-complement table: entry i is the reverse complement of the
# 4 bases packed into byte i (2 bits per base, complement of code b is 3 - b).
# Filled once when the module is imported.
_REVCOMP8 = []
for _byte in range(256):
    _flipped = 0
    # Read the bases from the low end and push them in from the right, which
    # reverses their order; 3 - code complements each one.
    for _shift in (0, 2, 4, 6):
        _flipped = (_flipped << 2) | (3 - ((_byte >> _shift) & 3))
    _REVCOMP8.append(_flipped)
del _byte, _flipped, _shift


def revcomp_int(kmer: int, k: int) -> int:
    """Return the reverse complement of an integer-encoded kmer.

    ``kmer`` holds ``k`` bases at 2 bits per base and must not have any bit
    set above bit ``2 * k - 1``.  The value is consumed one byte (4 bases)
    at a time through the ``_REVCOMP8`` table; the final, possibly shorter,
    chunk is right-aligned by dropping the padding bits.
    """
    remaining = 2 * k
    result = 0
    while remaining > 0:
        width = 8 if remaining >= 8 else remaining
        # For a partial chunk the table entry carries complemented padding in
        # its low bits; the shift discards it.
        result = (result << width) | (_REVCOMP8[kmer & 0xFF] >> (8 - width))
        kmer >>= width
        remaining -= width
    return result
|
||||||
|
|
||||||
|
|
||||||
|
# ── FASTQ parsing ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def iter_sequences(path: str):
    """Yield the sequence line of every record in a FASTQ file.

    Files whose name ends in ``.gz`` are read through ``gzip``; anything else
    is opened as plain text.  Records are read as fixed groups of four lines
    (header, sequence, '+', quality); only the sequence is yielded, with its
    trailing newline removed.  Iteration stops at the first empty header
    read, i.e. at end of file.
    """
    open_fn = gzip.open if path.endswith('.gz') else open
    with open_fn(path, 'rt') as handle:
        header = handle.readline()
        while header:
            sequence = handle.readline()
            handle.readline()  # '+' separator line
            handle.readline()  # quality line
            yield sequence.rstrip('\n')
            header = handle.readline()
|
||||||
|
|
||||||
|
|
||||||
|
# ── kmer counting ────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def count_kmers(paths: list[str], k: int) -> dict[int, int]:
    """Count canonical kmers over all reads of the given FASTQ files.

    Each read is scanned with a rolling 2-bit-per-base window of ``k`` bases.
    Any character missing from ``_ENCODE`` (e.g. N) resets the window, so no
    kmer spans it.  For every full window the smaller of the kmer and its
    reverse complement is counted.

    Progress is reported on stderr every 100,000 reads, followed by a final
    summary line.

    Returns the mapping canonical-kmer-integer -> number of occurrences.
    """
    kmer_mask = (1 << (2 * k)) - 1
    tally: dict[int, int] = defaultdict(int)
    reads_seen = 0

    for fastq in paths:
        for read in iter_sequences(fastq):
            reads_seen += 1
            window = 0
            valid = 0  # consecutive encodable bases currently in the window

            for base in read:
                code = _ENCODE.get(base)
                if code is not None:
                    window = ((window << 2) | code) & kmer_mask
                    valid += 1
                    if valid >= k:
                        other = revcomp_int(window, k)
                        canonical = other if other < window else window
                        tally[canonical] += 1
                else:
                    # N or unexpected character: start a fresh window.
                    window = 0
                    valid = 0

            if reads_seen % 100_000 == 0:
                print(f' {reads_seen:,} reads processed, '
                      f'{len(tally):,} distinct kmers so far',
                      file=sys.stderr)

    print(f' {reads_seen:,} reads total, {len(tally):,} distinct kmers',
          file=sys.stderr)
    return tally
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: count canonical kmers and write them to .npz.

    Parses the arguments, counts the kmers of all input FASTQ files with
    ``count_kmers``, optionally drops kmers whose count is below
    ``--min-abundance``, and saves two parallel arrays with
    ``np.savez_compressed``: ``kmers`` (uint64, ascending) and ``counts``
    (uint32, same order).  Progress messages go to stderr.

    Exits through ``argparse`` (status 2) when ``--kmer-size`` is outside
    1..32: kmers are packed at 2 bits per base into a uint64, so a larger k
    would only fail after all reads had been counted, and k < 1 is
    meaningless.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reads', nargs='+', metavar='FASTQ',
                    help='Input reads (FASTQ, gzip OK)')
    ap.add_argument('-k', '--kmer-size', type=int, default=31,
                    metavar='K')
    ap.add_argument('--min-abundance', type=int, default=1,
                    metavar='N', help='Drop kmers with count < N (default 1)')
    ap.add_argument('-o', '--output', required=True,
                    metavar='FILE', help='Output .npz path')
    args = ap.parse_args()

    # Reject unusable kmer sizes before doing any expensive work.
    if not 1 <= args.kmer_size <= 32:
        ap.error(f'--kmer-size must be between 1 and 32 (got {args.kmer_size})')

    print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr)
    counts = count_kmers(args.reads, args.kmer_size)

    if args.min_abundance > 1:
        before = len(counts)
        counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
        print(f' min-abundance={args.min_abundance}: '
              f'{before - len(counts):,} kmers dropped, '
              f'{len(counts):,} retained',
              file=sys.stderr)

    print(f'Sorting and saving → {args.output}', file=sys.stderr)
    # Sort once and build both arrays from the same ordering, so counts are
    # looked up with the original Python ints rather than numpy scalars.
    ordered = sorted(counts)
    kmers_arr = np.fromiter(ordered, dtype=np.uint64, count=len(ordered))
    counts_arr = np.array([counts[km] for km in ordered], dtype=np.uint32)

    np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
    print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+39
@@ -0,0 +1,39 @@
|
|||||||
|
#!/usr/bin/env bash
# Build one reference kmer index (.npz) per simulated strain.
#
# Walks simulated_data/<species>/<strain>/ next to this script and, for every
# strain directory holding both reads_R1.fastq.gz and reads_R2.fastq.gz, runs
# build_reference.py to write reference_index/<species>--<strain>.npz.
# Strains missing either read file are reported on stderr and skipped.
#
# Environment overrides:
#   KMER_SIZE       value passed as --kmer-size      (default 31)
#   MIN_ABUNDANCE   value passed as --min-abundance  (default 1)
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
reads_root="${here}/simulated_data"
index_root="${here}/reference_index"
py="${here}/../.venv/bin/python3"
builder="${here}/build_reference.py"

# Keep any value supplied by the caller, otherwise fall back to the default.
: "${KMER_SIZE:=31}"
: "${MIN_ABUNDANCE:=1}"

mkdir -p "${index_root}"

# Index a single strain.
#   $1  species name
#   $2  strain directory (as produced by the glob, i.e. with a trailing slash)
index_strain() {
    local sp="$1" dir="$2"
    local st fwd rev npz

    st=$(basename "${dir}")
    fwd="${dir}/reads_R1.fastq.gz"
    rev="${dir}/reads_R2.fastq.gz"

    # Both mates are required; otherwise report and move on to the next strain.
    if ! [[ -f "${fwd}" && -f "${rev}" ]]; then
        echo "SKIP ${sp}--${st}: reads not found" >&2
        return 0
    fi

    npz="${index_root}/${sp}--${st}.npz"
    echo "[${sp}--${st}] → ${npz}"

    "${py}" "${builder}" \
        --kmer-size "${KMER_SIZE}" \
        --min-abundance "${MIN_ABUNDANCE}" \
        --output "${npz}" \
        "${fwd}" "${rev}"
}

for sp_dir in "${reads_root}"/*/; do
    # An unmatched glob is left as a literal pattern; the -d test filters it out.
    [[ -d "${sp_dir}" ]] || continue
    sp_name=$(basename "${sp_dir}")

    for st_dir in "${sp_dir}"*/; do
        [[ -d "${st_dir}" ]] || continue
        index_strain "${sp_name}" "${st_dir}"
    done
done
|
||||||
@@ -0,0 +1,199 @@
|
|||||||
|
SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||||
|
SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
|
||||||
|
|
||||||
|
# Escherichia_coli--K-12_MG1655
|
||||||
|
simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--EDL933
|
||||||
|
simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--LT2
|
||||||
|
simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--CFT073
|
||||||
|
simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
|
||||||
|
|
||||||
|
# Bacillus_subtilis--168
|
||||||
|
simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
|
||||||
|
reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
|
||||||
|
stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--P125109
|
||||||
|
simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
|
||||||
|
|
||||||
|
# Shouchella_clausii--KSM-K16
|
||||||
|
simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
|
||||||
|
reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
|
||||||
|
stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli--K-12_W3110
|
||||||
|
simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
|
||||||
|
reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
|
||||||
|
stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--MGH_78578
|
||||||
|
simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
|
||||||
|
|
||||||
|
# Opitutus_terrae--PB90-1
|
||||||
|
simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
|
||||||
|
reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
|
||||||
|
stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
|
||||||
|
|
||||||
|
# Saccharolobus_islandicus--M.16.4
|
||||||
|
simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
|
||||||
|
reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
|
||||||
|
stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
|
||||||
|
|
||||||
|
# Acidobacterium_capsulatum--ATCC_51196
|
||||||
|
simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
|
||||||
|
reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||||
|
stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--AKU_12601
|
||||||
|
simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
|
||||||
|
|
||||||
|
# Proteus_mirabilis--HI4320
|
||||||
|
simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
|
||||||
|
reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
|
||||||
|
stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
|
||||||
|
|
||||||
|
# Salmonella_enterica--CT18
|
||||||
|
simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
|
||||||
|
reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
|
||||||
|
stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--HS11286
|
||||||
|
simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
|
||||||
|
|
||||||
|
# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
|
||||||
|
simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
|
||||||
|
reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||||
|
stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
|
||||||
|
|
||||||
|
# Klebsiella_pneumoniae--ATCC_13883
|
||||||
|
simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
|
||||||
|
reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||||
|
stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
|
||||||
|
|
||||||
|
# Yersinia_ruckeri--YRB
|
||||||
|
simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
|
||||||
|
reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
|
||||||
|
stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
|
||||||
|
|
||||||
|
# Candidozyma_auris--GCF_003013715.1_ASM301371v2
|
||||||
|
simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
|
||||||
|
reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
|
||||||
|
stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||||
|
stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
|
||||||
|
|
||||||
|
# Escherichia_coli
|
||||||
|
specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
|
||||||
|
# Salmonella_enterica
|
||||||
|
specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
|
||||||
|
# Bacillus_subtilis
|
||||||
|
specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
|
||||||
|
# Shouchella_clausii
|
||||||
|
specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
|
||||||
|
# Klebsiella_pneumoniae
|
||||||
|
specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
|
||||||
|
# Opitutus_terrae
|
||||||
|
specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
|
||||||
|
# Saccharolobus_islandicus
|
||||||
|
specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
|
||||||
|
# Acidobacterium_capsulatum
|
||||||
|
specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
|
||||||
|
# Proteus_mirabilis
|
||||||
|
specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
|
||||||
|
# Wolbachia_endosymbiont
|
||||||
|
specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
|
||||||
|
# Yersinia_ruckeri
|
||||||
|
specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
|
||||||
|
# Candidozyma_auris
|
||||||
|
specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
|
||||||
|
specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
|
||||||
Executable
+48
@@ -0,0 +1,48 @@
|
|||||||
|
#!/usr/bin/env bash
# Download the benchmark genome assemblies and store them, compressed, in
# ./genomes (relative to the current working directory).
#
# For each accession: fetch the genome package with the `datasets` CLI,
# unpack it, run every *.fna file through `obiconvert -Z` into
# genomes/<name>.fna.gz, then remove the zip and the unpacked directory.

set -euo pipefail

assemblies=(
    GCF_000005845.2   # Escherichia_coli--K-12_MG1655
    GCF_000010245.2   # Escherichia_coli--K-12_W3110
    GCF_000007445.1   # Escherichia_coli--CFT073
    GCF_000006665.1   # Escherichia_coli--EDL933

    GCF_000006945.2   # Salmonella_enterica--LT2
    GCF_000195995.1   # Salmonella_enterica--CT18
    GCF_000009505.1   # Salmonella_enterica--P125109
    GCF_000026565.1   # Salmonella_enterica--AKU_12601

    GCF_000016305.1   # Klebsiella_pneumoniae--MGH_78578
    GCF_000019965.1   # Opitutus_terrae--PB90-1
    GCF_000240185.1   # Klebsiella_pneumoniae--HS11286
    GCF_000742135.1   # Klebsiella_pneumoniae--ATCC_13883

    GCF_000069965.1   # Proteus_mirabilis--HI4320
    GCF_000022565.1   # Acidobacterium_capsulatum--ATCC_51196
    GCF_000306885.1   # Wolbachia_endosymbiont
    GCF_003013715.1   # Candidozyma_auris

    GCF_000009045.1   # Bacillus_subtilis--168
    GCF_000009825.1   # Shouchella_clausii--KSM-K16
    GCF_000022445.1   # Saccharolobus_islandicus--M.16.4
    GCF_000834255.1   # Yersinia_ruckeri--YRB
)

mkdir -p genomes

for acc in "${assemblies[@]}"; do
    echo "Downloading ${acc}"

    datasets download genome accession "${acc}" \
        --include genome \
        --filename "${acc}.zip"

    unzip -q "${acc}.zip" -d "${acc}"

    # IFS= and -r keep each path exactly as find printed it (no whitespace
    # trimming, no backslash processing); the quotes stop word splitting and
    # globbing when the path is passed on.
    find "${acc}" -name "*.fna" |
        while IFS= read -r file; do
            # NOTE(review): -Z presumably asks obiconvert for compressed
            # output, matching the .gz suffix -- confirm against its help.
            obiconvert -Z "${file}" >"genomes/$(basename "${file}").gz"
        done

    rm -rf "${acc}" "${acc}.zip"
done
|
||||||
Executable
+108
@@ -0,0 +1,108 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: filter_one_count.sh SPECIES
# Filters global_index_count to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
#   specific_index_count/SPECIES/index.done   (written by obikmer select)
#   stats/specific_kmer_count/SPECIES.stats   (one CSV data row, no header)
#
# The stderr of both obikmer runs is captured in temporary logs so that the
# timing/memory report can be parsed, and is always replayed on this script's
# stderr -- including when obikmer fails.
set -euo pipefail

if [[ $# -lt 1 ]]; then
    echo "Usage: $(basename "${BASH_SOURCE[0]}") SPECIES" >&2
    exit 2
fi

SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

SOURCE="${SCRIPT_DIR}/global_index_count"
OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
# The stats row is written here first and renamed once complete, so a failed
# parse never leaves an empty or truncated .stats file behind.
STATS_TMP="${STATS_FILE}.tmp"

mkdir -p "${STATS_DIR}"

echo "[${SPECIES}] filter (count) → ${OUTPUT}"

LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}" "${STATS_TMP}"' EXIT

# Run a command with its stderr captured in a log file, then replay the log on
# stderr whether or not the command succeeded, and return the command's status.
#   $1    log file
#   $2..  command and arguments
# Without this, `set -e` would abort on a failing command before the log is
# shown and the EXIT trap would delete it, hiding the error message.
run_logged() {
    local log="$1" status=0
    shift
    "$@" 2>"${log}" || status=$?
    cat "${log}" >&2
    return "${status}"
}

run_logged "${LOG_FILTER}" "${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}"

run_logged "${LOG_SELECT}" "${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}"

python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_TMP}"
# Turn the two captured obikmer logs into one CSV row on stdout:
#   species, then a (wall seconds, RSS bytes) pair for each of
#   rebuild, pack, filter TOTAL, select, select TOTAL.
import sys, re

species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]

def strip_ansi(s):
    """Remove ANSI CSI escape sequences (colours etc.) from s."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)

def parse_wall(s):
    """Convert a duration such as '12ms' or '3.4s' to seconds; 0.0 if neither."""
    s = s.strip()
    # 'ms' must be tested first because it also ends with 's'.
    if s.endswith('ms'): return float(s[:-2]) / 1000.0
    if s.endswith('s'): return float(s[:-1])
    return 0.0

def parse_rss(s):
    """Convert a size such as '1.5 GB' to an integer byte count; 0 if unparsable."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m: return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])

def is_sep(s):
    """True for a non-empty line without any letter or digit (a table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)

def parse_reporter(logfile):
    """Extract {stage: (wall_seconds, rss_bytes)} from the report table in logfile.

    State machine: scan for the header line (contains the words 'stage' and
    'wall'), skip to the separator below it, read one stage per row until the
    next separator, then read the first non-empty line as the 'TOTAL' entry.
    """
    stats = {}
    state = 'scan'
    with open(logfile, errors='replace') as fh:
        for raw in fh:
            line = strip_ansi(raw.rstrip('\n'))
            s = line.strip()
            if state == 'scan':
                if re.search(r'\bstage\b.*\bwall\b', line):
                    state = 'in_header'
            elif state == 'in_header':
                if is_sep(s): state = 'rows'
            elif state == 'rows':
                if is_sep(s): state = 'total'
                elif s:
                    # NOTE(review): columns 1 and 3 are taken as wall time and
                    # RSS. If a value contains a single space (e.g. '1.5 GB')
                    # this split would break it apart -- confirm against the
                    # actual obikmer report layout.
                    parts = re.split(r' +', s)
                    if len(parts) >= 4:
                        stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
            elif state == 'total':
                if s:
                    parts = re.split(r' +', s)
                    if len(parts) >= 3:
                        stats['TOTAL'] = (parse_wall(parts[1]),
                                          parse_rss(parts[3]) if len(parts) > 3 else 0)
                    break
    return stats

f = parse_reporter(log_filter)
s = parse_reporter(log_select)

row = [species]
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
    # '*_total' columns come from the TOTAL line of the corresponding log.
    key = 'TOTAL' if stage.endswith('_total') else stage
    # A stage missing from the log yields two empty CSV fields.
    w, r = d.get(key, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
print(','.join(row))
PYEOF

mv "${STATS_TMP}" "${STATS_FILE}"
|
||||||
Executable
+108
@@ -0,0 +1,108 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Usage: filter_one_presence.sh SPECIES
|
||||||
|
# Filters global_index_presence to keep only kmers specific to SPECIES,
|
||||||
|
# then selects the SPECIES column in-place.
|
||||||
|
# Outputs:
|
||||||
|
# specific_index_presence/SPECIES/index.done (written by obikmer select)
|
||||||
|
# stats/specific_kmer_presence/SPECIES.stats (one CSV data row, no header)
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SPECIES="$1"
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
|
||||||
|
|
||||||
|
SOURCE="${SCRIPT_DIR}/global_index_presence"
|
||||||
|
OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
|
||||||
|
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
|
||||||
|
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
|
||||||
|
|
||||||
|
mkdir -p "${STATS_DIR}"
|
||||||
|
|
||||||
|
echo "[${SPECIES}] filter (presence) → ${OUTPUT}"
|
||||||
|
|
||||||
|
# Each obikmer run has its stderr captured to a temp file so that the Python
# step further down can read the stage table from it.
LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT

# Step 1: filter the global index down to kmers specific to SPECIES.
# The exit status is recorded instead of letting `set -e` abort right away:
# an immediate exit would skip the `cat` below and the EXIT trap would delete
# the only copy of obikmer's error output.
status=0
"${BINARY}" filter \
    --output "${OUTPUT}" \
    --force \
    --ingroup "species=${SPECIES}" \
    --outgroup all \
    --min-frac 0.5 \
    --max-frac 1.0 \
    --max-outgroup-count 0 \
    "${SOURCE}" \
    2>"${LOG_FILTER}" || status=$?

# Replay the captured stderr whether or not the command succeeded.
cat "${LOG_FILTER}" >&2
if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer filter failed for ${SPECIES} (exit ${status})" >&2
    exit "${status}"
fi

# Step 2: keep only the SPECIES column, rewriting OUTPUT in place.
status=0
"${BINARY}" select \
    --in-place \
    --group "${SPECIES}:species=${SPECIES}" \
    --group-op "${SPECIES}:any" \
    --select "${SPECIES}" \
    "${OUTPUT}" \
    2>"${LOG_SELECT}" || status=$?

cat "${LOG_SELECT}" >&2
if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer select failed for ${SPECIES} (exit ${status})" >&2
    exit "${status}"
fi
|
||||||
|
|
||||||
|
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||||
|
|
||||||
|
def strip_ansi(s):
|
||||||
|
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
|
||||||
|
|
||||||
|
def parse_wall(s):
    """Convert a wall-time cell of the stage table to seconds.

    The cell is a number followed by a unit: ns, us / µs, ms or s
    (e.g. '850ms' -> 0.85, '12.4s' -> 12.4).  Anything else, including a
    unit with a malformed number in front of it, yields 0.0.
    """
    # The lazy prefix makes the longest unit win, so '12ms' is read as
    # 12 + 'ms' and never as '12m' + 's'.  \u00b5 / \u03bc are the two
    # code points commonly used for the micro sign.
    m = re.match(r'(.*?)\s*(ns|us|\u00b5s|\u03bcs|ms|s)$', s.strip())
    if not m:
        return 0.0
    # Units per second; dividing keeps 'ms' and 's' results identical to the
    # former float(x) / 1000.0 and float(x).
    per_second = {'ns': 1e9, 'us': 1e6, '\u00b5s': 1e6, '\u03bcs': 1e6,
                  'ms': 1000.0, 's': 1.0}
    try:
        return float(m.group(1)) / per_second[m.group(2)]
    except ValueError:
        # Previously '15µs' or '3ns' reached float('15µ') and aborted the
        # whole stats extraction.
        return 0.0
|
||||||
|
|
||||||
|
def parse_rss(s):
|
||||||
|
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
|
||||||
|
if not m: return 0
|
||||||
|
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
|
||||||
|
|
||||||
|
def is_sep(s):
|
||||||
|
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
|
||||||
|
|
||||||
|
def parse_reporter(logfile):
|
||||||
|
stats = {}
|
||||||
|
state = 'scan'
|
||||||
|
with open(logfile, errors='replace') as fh:
|
||||||
|
for raw in fh:
|
||||||
|
line = strip_ansi(raw.rstrip('\n'))
|
||||||
|
s = line.strip()
|
||||||
|
if state == 'scan':
|
||||||
|
if re.search(r'\bstage\b.*\bwall\b', line):
|
||||||
|
state = 'in_header'
|
||||||
|
elif state == 'in_header':
|
||||||
|
if is_sep(s): state = 'rows'
|
||||||
|
elif state == 'rows':
|
||||||
|
if is_sep(s): state = 'total'
|
||||||
|
elif s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 4:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
|
||||||
|
elif state == 'total':
|
||||||
|
if s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 3:
|
||||||
|
stats['TOTAL'] = (parse_wall(parts[1]),
|
||||||
|
parse_rss(parts[3]) if len(parts) > 3 else 0)
|
||||||
|
break
|
||||||
|
return stats
|
||||||
|
|
||||||
|
f = parse_reporter(log_filter)
|
||||||
|
s = parse_reporter(log_select)
|
||||||
|
|
||||||
|
row = [species]
|
||||||
|
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
|
||||||
|
key = 'TOTAL' if stage.endswith('_total') else stage
|
||||||
|
w, r = d.get(key, ('', ''))
|
||||||
|
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
|
||||||
|
print(','.join(row))
|
||||||
|
PYEOF
|
||||||
Executable
+103
@@ -0,0 +1,103 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Usage: index_one_count.sh SPECIMEN
|
||||||
|
# SPECIMEN = "species--strain" (Make pattern stem)
|
||||||
|
# Outputs:
|
||||||
|
# specimen_index_count/SPECIMEN/index.done (written by obikmer)
|
||||||
|
# stats/indexing_count/SPECIMEN.stats (one CSV data row, no header)
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SPECIMEN="$1"
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
|
||||||
|
|
||||||
|
species="${SPECIMEN%%--*}"
|
||||||
|
strain="${SPECIMEN#*--}"
|
||||||
|
|
||||||
|
READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
|
||||||
|
INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
|
||||||
|
STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
|
||||||
|
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
|
||||||
|
|
||||||
|
mkdir -p "${STATS_DIR}"
|
||||||
|
|
||||||
|
r1="${READS_DIR}/reads_R1.fastq.gz"
|
||||||
|
r2="${READS_DIR}/reads_R2.fastq.gz"
|
||||||
|
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
|
||||||
|
echo "ERROR: reads not found in ${READS_DIR}" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
|
||||||
|
|
||||||
|
# obikmer's stderr is captured to a temp file so that the Python step further
# down can read the stage table from it.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# The exit status is recorded instead of letting `set -e` abort right away:
# an immediate exit would skip the `cat` below and the EXIT trap would delete
# the only copy of obikmer's error output.
status=0
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --with-counts \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || status=$?

# Replay the captured stderr whether or not the command succeeded.
cat "${STDERR_LOG}" >&2
if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer index failed for ${SPECIMEN} (exit ${status})" >&2
    exit "${status}"
fi
|
||||||
|
|
||||||
|
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||||
|
|
||||||
|
def strip_ansi(s):
|
||||||
|
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
|
||||||
|
|
||||||
|
def parse_wall(s):
    """Convert a wall-time cell of the stage table to seconds.

    The cell is a number followed by a unit: ns, us / µs, ms or s
    (e.g. '850ms' -> 0.85, '12.4s' -> 12.4).  Anything else, including a
    unit with a malformed number in front of it, yields 0.0.
    """
    # The lazy prefix makes the longest unit win, so '12ms' is read as
    # 12 + 'ms' and never as '12m' + 's'.  \u00b5 / \u03bc are the two
    # code points commonly used for the micro sign.
    m = re.match(r'(.*?)\s*(ns|us|\u00b5s|\u03bcs|ms|s)$', s.strip())
    if not m:
        return 0.0
    # Units per second; dividing keeps 'ms' and 's' results identical to the
    # former float(x) / 1000.0 and float(x).
    per_second = {'ns': 1e9, 'us': 1e6, '\u00b5s': 1e6, '\u03bcs': 1e6,
                  'ms': 1000.0, 's': 1.0}
    try:
        return float(m.group(1)) / per_second[m.group(2)]
    except ValueError:
        # Previously '15µs' or '3ns' reached float('15µ') and aborted the
        # whole stats extraction.
        return 0.0
|
||||||
|
|
||||||
|
def parse_rss(s):
|
||||||
|
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
|
||||||
|
if not m: return 0
|
||||||
|
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
|
||||||
|
|
||||||
|
def is_sep(s):
|
||||||
|
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
|
||||||
|
|
||||||
|
stats = {}
|
||||||
|
state = 'scan'
|
||||||
|
|
||||||
|
with open(logfile, errors='replace') as fh:
|
||||||
|
for raw in fh:
|
||||||
|
line = strip_ansi(raw.rstrip('\n'))
|
||||||
|
s = line.strip()
|
||||||
|
if state == 'scan':
|
||||||
|
if re.search(r'\bstage\b.*\bwall\b', line):
|
||||||
|
state = 'in_header'
|
||||||
|
elif state == 'in_header':
|
||||||
|
if is_sep(s): state = 'rows'
|
||||||
|
elif state == 'rows':
|
||||||
|
if is_sep(s): state = 'total'
|
||||||
|
elif s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 4:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
|
||||||
|
elif state == 'total':
|
||||||
|
if s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 3:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]),
|
||||||
|
parse_rss(parts[3]) if len(parts) > 3 else 0)
|
||||||
|
break
|
||||||
|
|
||||||
|
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
|
||||||
|
row = [species, strain]
|
||||||
|
for stage in STAGE_ORDER:
|
||||||
|
w, r = stats.get(stage, ('', ''))
|
||||||
|
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
|
||||||
|
tw, tr = stats.get('TOTAL', ('', ''))
|
||||||
|
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
|
||||||
|
print(','.join(row))
|
||||||
|
PYEOF
|
||||||
Executable
+102
@@ -0,0 +1,102 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Usage: index_one_presence.sh SPECIMEN
|
||||||
|
# SPECIMEN = "species--strain" (Make pattern stem)
|
||||||
|
# Outputs:
|
||||||
|
# specimen_index_presence/SPECIMEN/index.done (written by obikmer)
|
||||||
|
# stats/indexing_presence/SPECIMEN.stats (one CSV data row, no header)
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SPECIMEN="$1"
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
|
||||||
|
|
||||||
|
species="${SPECIMEN%%--*}"
|
||||||
|
strain="${SPECIMEN#*--}"
|
||||||
|
|
||||||
|
READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
|
||||||
|
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
|
||||||
|
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
|
||||||
|
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
|
||||||
|
|
||||||
|
mkdir -p "${STATS_DIR}"
|
||||||
|
|
||||||
|
r1="${READS_DIR}/reads_R1.fastq.gz"
|
||||||
|
r2="${READS_DIR}/reads_R2.fastq.gz"
|
||||||
|
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
|
||||||
|
echo "ERROR: reads not found in ${READS_DIR}" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"
|
||||||
|
|
||||||
|
# obikmer's stderr is captured to a temp file so that the Python step further
# down can read the stage table from it.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# The exit status is recorded instead of letting `set -e` abort right away:
# an immediate exit would skip the `cat` below and the EXIT trap would delete
# the only copy of obikmer's error output.
status=0
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || status=$?

# Replay the captured stderr whether or not the command succeeded.
cat "${STDERR_LOG}" >&2
if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer index failed for ${SPECIMEN} (exit ${status})" >&2
    exit "${status}"
fi
|
||||||
|
|
||||||
|
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||||
|
|
||||||
|
def strip_ansi(s):
|
||||||
|
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
|
||||||
|
|
||||||
|
def parse_wall(s):
    """Convert a wall-time cell of the stage table to seconds.

    The cell is a number followed by a unit: ns, us / µs, ms or s
    (e.g. '850ms' -> 0.85, '12.4s' -> 12.4).  Anything else, including a
    unit with a malformed number in front of it, yields 0.0.
    """
    # The lazy prefix makes the longest unit win, so '12ms' is read as
    # 12 + 'ms' and never as '12m' + 's'.  \u00b5 / \u03bc are the two
    # code points commonly used for the micro sign.
    m = re.match(r'(.*?)\s*(ns|us|\u00b5s|\u03bcs|ms|s)$', s.strip())
    if not m:
        return 0.0
    # Units per second; dividing keeps 'ms' and 's' results identical to the
    # former float(x) / 1000.0 and float(x).
    per_second = {'ns': 1e9, 'us': 1e6, '\u00b5s': 1e6, '\u03bcs': 1e6,
                  'ms': 1000.0, 's': 1.0}
    try:
        return float(m.group(1)) / per_second[m.group(2)]
    except ValueError:
        # Previously '15µs' or '3ns' reached float('15µ') and aborted the
        # whole stats extraction.
        return 0.0
|
||||||
|
|
||||||
|
def parse_rss(s):
|
||||||
|
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
|
||||||
|
if not m: return 0
|
||||||
|
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
|
||||||
|
|
||||||
|
def is_sep(s):
|
||||||
|
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
|
||||||
|
|
||||||
|
stats = {}
|
||||||
|
state = 'scan'
|
||||||
|
|
||||||
|
with open(logfile, errors='replace') as fh:
|
||||||
|
for raw in fh:
|
||||||
|
line = strip_ansi(raw.rstrip('\n'))
|
||||||
|
s = line.strip()
|
||||||
|
if state == 'scan':
|
||||||
|
if re.search(r'\bstage\b.*\bwall\b', line):
|
||||||
|
state = 'in_header'
|
||||||
|
elif state == 'in_header':
|
||||||
|
if is_sep(s): state = 'rows'
|
||||||
|
elif state == 'rows':
|
||||||
|
if is_sep(s): state = 'total'
|
||||||
|
elif s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 4:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
|
||||||
|
elif state == 'total':
|
||||||
|
if s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 3:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]),
|
||||||
|
parse_rss(parts[3]) if len(parts) > 3 else 0)
|
||||||
|
break
|
||||||
|
|
||||||
|
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
|
||||||
|
row = [species, strain]
|
||||||
|
for stage in STAGE_ORDER:
|
||||||
|
w, r = stats.get(stage, ('', ''))
|
||||||
|
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
|
||||||
|
tw, tr = stats.get('TOTAL', ('', ''))
|
||||||
|
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
|
||||||
|
print(','.join(row))
|
||||||
|
PYEOF
|
||||||
@@ -0,0 +1,118 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
|
||||||
|
|
||||||
|
Like C .d files: only target: prerequisites lines, no recipes.
|
||||||
|
Recipes stay in the Makefile as generic rules.
|
||||||
|
"""
|
||||||
|
import gzip
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
|
||||||
|
'endosymbiont', 'of'}
|
||||||
|
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
|
||||||
|
|
||||||
|
|
||||||
|
def is_stop(tok):
    """Tell whether *tok* marks the end of a strain name.

    True when the lower-cased token is one of STOP_WORDS or begins with one
    of STOP_PREFIXES.
    """
    lowered = tok.lower()
    if lowered in STOP_WORDS:
        return True
    # str.startswith accepts the whole tuple of prefixes at once.
    return lowered.startswith(STOP_PREFIXES)
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize(s):
    """Replace every character outside [A-Za-z0-9._-] with '_' and trim
    leading/trailing underscores."""
    cleaned = re.sub(r'[^A-Za-z0-9._-]', '_', s)
    return cleaned.strip('_')
|
||||||
|
|
||||||
|
|
||||||
|
def collect_tokens(text):
    """Join the leading words of *text* into one underscore-separated name.

    Words are taken in order, with trailing ',' and '.' removed, until the
    first stop token (see is_stop).  Each kept word is sanitized and words
    that sanitize to nothing are dropped.
    """
    kept = []
    for raw in text.split():
        word = raw.rstrip(',.')
        if is_stop(word):
            break
        kept.append(sanitize(word))
    return '_'.join(piece for piece in kept if piece)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_organism(defn, gcf_id):
    """Derive a (species, strain) pair of path-safe names from a definition.

    species is the first two words of *defn* joined by '_'.  The strain is
    looked for, in order, as:
      1. 'str. X' optionally followed by 'substr. Y'  -> 'X' or 'X_Y'
      2. the words following 'strain', up to the first stop token
      3. whatever follows the binomial (after dropping a leading
         'subsp. X' and 'serovar X'), up to the first stop token
    and falls back to *gcf_id* when nothing usable is found.

    A definition with fewer than two words cannot provide a binomial: the
    single word (or *gcf_id* if there is none) is used as species and
    *gcf_id* as strain, instead of raising IndexError.
    """
    words = defn.split()
    if len(words) < 2:
        lone = sanitize(words[0]) if words else ''
        return lone or gcf_id, gcf_id
    species = sanitize(words[0] + '_' + words[1])

    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
    if m:
        strain = sanitize(m.group(1))
        if m.group(2):
            strain += '_' + sanitize(m.group(2))
        # A token made only of punctuation sanitizes to '', which would give
        # an empty strain component; use the accession in that case.
        return species, strain if strain else gcf_id

    m = re.search(r'\bstrain\b\s+(.*)', defn)
    if m:
        strain = collect_tokens(m.group(1))
        if strain:
            return species, strain

    remainder = re.sub(r'^\S+ \S+\s*', '', defn)
    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
    strain = collect_tokens(remainder)
    return species, strain if strain else gcf_id
|
||||||
|
|
||||||
|
|
||||||
|
def first_definition(path):
    """Return the description of the first record of a gzipped FASTA file.

    Only the first line starting with '>' is examined.  If it carries a
    '"definition":"..."' annotation, that text is returned; otherwise the
    first whitespace-separated word after '>' is.  When the file has no
    header line, or the header is a bare '>', the file name's stem is
    returned instead.
    """
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            if not line.startswith('>'):
                continue
            m = re.search(r'"definition":"([^"]*)"', line)
            if m:
                return m.group(1)
            fields = line[1:].split()
            if fields:
                return fields[0]
            # Bare '>' header: indexing the empty list used to raise
            # IndexError; use the file-name fallback below.
            break
    return Path(path).stem
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Print the dependency declarations for the genomes given on the command line.

    Usage:
        <script> GENOME.fna.gz ...       print SPECIMENS/SPECIES lists and
                                         per-specimen / per-species rules
        <script> --dir-for GENOME.fna.gz print only that genome's
                                         simulated-reads directory

    Output goes to stdout; warnings go to stderr.
    """
    def describe(path):
        # (species, strain) derived from the genome's first FASTA header.
        gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
        return parse_organism(first_definition(path), gcf_id)

    argv = sys.argv[1:]

    # NOTE(review): the simulate-all shell script in this directory invokes
    # `make_deps.py --dir-for GENOME`; without this branch '--dir-for' was
    # treated as a genome path and gzip.open() failed on it.  Confirm this is
    # the script it refers to.
    if argv and argv[0] == '--dir-for':
        if len(argv) != 2:
            sys.exit('usage: --dir-for GENOME.fna.gz')
        sp, st = describe(argv[1])
        print(f'simulated_data/{sp}/{st}')
        return

    entries = []  # (specimen, species, sim_dir, genome_path)
    species_seen = []
    origin = {}   # specimen -> first genome path that produced it

    for path in sorted(argv):
        sp, st = describe(path)
        specimen = f'{sp}--{st}'
        sim_dir = f'simulated_data/{sp}/{st}'
        if specimen in origin:
            # Two genomes with the same name would share every target below.
            print(f'WARNING: {path} and {origin[specimen]} both map to '
                  f'specimen {specimen}', file=sys.stderr)
        else:
            origin[specimen] = path
        entries.append((specimen, sp, sim_dir, path))
        if sp not in species_seen:
            species_seen.append(sp)

    specimens = [e[0] for e in entries]
    print('SPECIMENS :=', ' '.join(specimens))
    print('SPECIES :=', ' '.join(species_seen))

    for specimen, species, sim_dir, genome in entries:
        reads = f'{sim_dir}/reads_R1.fastq.gz'
        p_done = f'specimen_index_presence/{specimen}/index.done'
        p_stats = f'stats/indexing_presence/{specimen}.stats'
        c_done = f'specimen_index_count/{specimen}/index.done'
        c_stats = f'stats/indexing_count/{specimen}.stats'
        ref = f'reference_index/{specimen}.npz'
        vp = f'stats/verify_presence/{specimen}.stats'
        vc = f'stats/verify_count/{specimen}.stats'

        print()
        print(f'# {specimen}')
        print(f'{reads}: {genome}')
        print(f'{ref}: {reads}')
        print(f'{p_done} {p_stats}: {reads}')
        print(f'{c_done} {c_stats}: {reads}')
        print(f'{vp}: {ref} {p_done}')
        print(f'{vc}: {ref} {c_done}')

    print()
    for sp in species_seen:
        sp_done = f'specific_index_presence/{sp}/index.done'
        sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
        sc_done = f'specific_index_count/{sp}/index.done'
        sc_stats = f'stats/specific_kmer_count/{sp}.stats'
        print(f'# {sp}')
        print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
        print(f'{sc_done} {sc_stats}: global_index_count/index.done')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+103
@@ -0,0 +1,103 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
|
||||||
|
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
|
||||||
|
OUTPUT="${SCRIPT_DIR}/global_index_count"
|
||||||
|
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"
|
||||||
|
|
||||||
|
mkdir -p "${STATS_DIR}"
|
||||||
|
|
||||||
|
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
|
||||||
|
CSV="${STATS_DIR}/run_${run_n}.csv"
|
||||||
|
|
||||||
|
printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
|
||||||
|
|
||||||
|
parse_reporter() {
|
||||||
|
local run="$1" n_sources="$2" logfile="$3"
|
||||||
|
python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||||
|
|
||||||
|
def strip_ansi(s):
|
||||||
|
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
|
||||||
|
|
||||||
|
def parse_wall(s):
    """Convert a wall-time cell of the stage table to seconds.

    The cell is a number followed by a unit: ns, us / µs, ms or s
    (e.g. '850ms' -> 0.85, '12.4s' -> 12.4).  Anything else, including a
    unit with a malformed number in front of it, yields 0.0.
    """
    # The lazy prefix makes the longest unit win, so '12ms' is read as
    # 12 + 'ms' and never as '12m' + 's'.  \u00b5 / \u03bc are the two
    # code points commonly used for the micro sign.
    m = re.match(r'(.*?)\s*(ns|us|\u00b5s|\u03bcs|ms|s)$', s.strip())
    if not m:
        return 0.0
    # Units per second; dividing keeps 'ms' and 's' results identical to the
    # former float(x) / 1000.0 and float(x).
    per_second = {'ns': 1e9, 'us': 1e6, '\u00b5s': 1e6, '\u03bcs': 1e6,
                  'ms': 1000.0, 's': 1.0}
    try:
        return float(m.group(1)) / per_second[m.group(2)]
    except ValueError:
        # Previously '15µs' or '3ns' reached float('15µ') and aborted the
        # whole stats extraction.
        return 0.0
|
||||||
|
|
||||||
|
def parse_rss(s):
|
||||||
|
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
|
||||||
|
if not m: return 0
|
||||||
|
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
|
||||||
|
|
||||||
|
def is_sep(s):
|
||||||
|
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
|
||||||
|
|
||||||
|
stats = {}
|
||||||
|
state = 'scan'
|
||||||
|
|
||||||
|
with open(logfile, errors='replace') as fh:
|
||||||
|
for raw in fh:
|
||||||
|
line = strip_ansi(raw.rstrip('\n'))
|
||||||
|
s = line.strip()
|
||||||
|
|
||||||
|
if state == 'scan':
|
||||||
|
if re.search(r'\bstage\b.*\bwall\b', line):
|
||||||
|
state = 'in_header'
|
||||||
|
elif state == 'in_header':
|
||||||
|
if is_sep(s):
|
||||||
|
state = 'rows'
|
||||||
|
elif state == 'rows':
|
||||||
|
if is_sep(s):
|
||||||
|
state = 'total'
|
||||||
|
elif s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 4:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
|
||||||
|
elif state == 'total':
|
||||||
|
if s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 3:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]),
|
||||||
|
parse_rss(parts[3]) if len(parts) > 3 else 0)
|
||||||
|
break
|
||||||
|
|
||||||
|
STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
|
||||||
|
row = [run, n_sources]
|
||||||
|
for stage in STAGE_ORDER:
|
||||||
|
w, r = stats.get(stage, ('', ''))
|
||||||
|
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
|
||||||
|
tw, tr = stats.get('TOTAL', ('', ''))
|
||||||
|
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
|
||||||
|
print(','.join(row))
|
||||||
|
PYEOF
|
||||||
|
}
|
||||||
|
|
||||||
|
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
|
||||||
|
|
||||||
|
if [[ ${#sources[@]} -eq 0 ]]; then
|
||||||
|
echo "ERROR: no indexes found in ${IDX_DIR}" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
|
||||||
|
printf ' %s\n' "${sources[@]}"
|
||||||
|
|
||||||
|
# obikmer's stderr is captured to a temp file so that parse_reporter can read
# the stage table from it.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# The exit status is recorded instead of letting `set -e` abort right away:
# an immediate exit would skip the `cat` below and the EXIT trap would delete
# the only copy of obikmer's error output.
status=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || status=$?

# Replay the captured stderr whether or not the command succeeded.
cat "${STDERR_LOG}" >&2
if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer merge failed (exit ${status})" >&2
    exit "${status}"
fi
|
||||||
|
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
|
||||||
|
|
||||||
|
echo "Done. Run ${run_n} → ${CSV}"
|
||||||
Executable
+104
@@ -0,0 +1,104 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||||
|
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
|
||||||
|
IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
|
||||||
|
OUTPUT="${SCRIPT_DIR}/global_index_presence"
|
||||||
|
STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"
|
||||||
|
|
||||||
|
mkdir -p "${STATS_DIR}"
|
||||||
|
|
||||||
|
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
|
||||||
|
CSV="${STATS_DIR}/run_${run_n}.csv"
|
||||||
|
|
||||||
|
printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
|
||||||
|
|
||||||
|
parse_reporter() {
|
||||||
|
local run="$1" n_sources="$2" logfile="$3"
|
||||||
|
python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
|
||||||
|
import sys, re
|
||||||
|
|
||||||
|
run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||||
|
|
||||||
|
def strip_ansi(s):
|
||||||
|
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
|
||||||
|
|
||||||
|
def parse_wall(s):
    """Convert a wall-time cell of the stage table to seconds.

    The cell is a number followed by a unit: ns, us / µs, ms or s
    (e.g. '850ms' -> 0.85, '12.4s' -> 12.4).  Anything else, including a
    unit with a malformed number in front of it, yields 0.0.
    """
    # The lazy prefix makes the longest unit win, so '12ms' is read as
    # 12 + 'ms' and never as '12m' + 's'.  \u00b5 / \u03bc are the two
    # code points commonly used for the micro sign.
    m = re.match(r'(.*?)\s*(ns|us|\u00b5s|\u03bcs|ms|s)$', s.strip())
    if not m:
        return 0.0
    # Units per second; dividing keeps 'ms' and 's' results identical to the
    # former float(x) / 1000.0 and float(x).
    per_second = {'ns': 1e9, 'us': 1e6, '\u00b5s': 1e6, '\u03bcs': 1e6,
                  'ms': 1000.0, 's': 1.0}
    try:
        return float(m.group(1)) / per_second[m.group(2)]
    except ValueError:
        # Previously '15µs' or '3ns' reached float('15µ') and aborted the
        # whole stats extraction.
        return 0.0
|
||||||
|
|
||||||
|
def parse_rss(s):
|
||||||
|
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
|
||||||
|
if not m: return 0
|
||||||
|
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
|
||||||
|
|
||||||
|
def is_sep(s):
|
||||||
|
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
|
||||||
|
|
||||||
|
stats = {}
|
||||||
|
state = 'scan'
|
||||||
|
|
||||||
|
with open(logfile, errors='replace') as fh:
|
||||||
|
for raw in fh:
|
||||||
|
line = strip_ansi(raw.rstrip('\n'))
|
||||||
|
s = line.strip()
|
||||||
|
|
||||||
|
if state == 'scan':
|
||||||
|
if re.search(r'\bstage\b.*\bwall\b', line):
|
||||||
|
state = 'in_header'
|
||||||
|
elif state == 'in_header':
|
||||||
|
if is_sep(s):
|
||||||
|
state = 'rows'
|
||||||
|
elif state == 'rows':
|
||||||
|
if is_sep(s):
|
||||||
|
state = 'total'
|
||||||
|
elif s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 4:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
|
||||||
|
elif state == 'total':
|
||||||
|
if s:
|
||||||
|
parts = re.split(r' +', s)
|
||||||
|
if len(parts) >= 3:
|
||||||
|
stats[parts[0]] = (parse_wall(parts[1]),
|
||||||
|
parse_rss(parts[3]) if len(parts) > 3 else 0)
|
||||||
|
break
|
||||||
|
|
||||||
|
STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
|
||||||
|
row = [run, n_sources]
|
||||||
|
for stage in STAGE_ORDER:
|
||||||
|
w, r = stats.get(stage, ('', ''))
|
||||||
|
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
|
||||||
|
tw, tr = stats.get('TOTAL', ('', ''))
|
||||||
|
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
|
||||||
|
print(','.join(row))
|
||||||
|
PYEOF
|
||||||
|
}
|
||||||
|
|
||||||
|
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
|
||||||
|
|
||||||
|
if [[ ${#sources[@]} -eq 0 ]]; then
|
||||||
|
echo "ERROR: no indexes found in ${IDX_DIR}" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
|
||||||
|
printf ' %s\n' "${sources[@]}"
|
||||||
|
|
||||||
|
# obikmer's stderr is captured to a temp file so that parse_reporter can read
# the stage table from it.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# The exit status is recorded instead of letting `set -e` abort right away:
# an immediate exit would skip the `cat` below and the EXIT trap would delete
# the only copy of obikmer's error output.
status=0
"${BINARY}" merge \
    --output "${OUTPUT}" \
    --force \
    --force-presence \
    "${sources[@]}" \
    2>"${STDERR_LOG}" || status=$?

# Replay the captured stderr whether or not the command succeeded.
cat "${STDERR_LOG}" >&2
if [[ ${status} -ne 0 ]]; then
    echo "ERROR: obikmer merge failed (exit ${status})" >&2
    exit "${status}"
fi
|
||||||
|
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
|
||||||
|
|
||||||
|
echo "Done. Run ${run_n} → ${CSV}"
|
||||||
Executable
+12
@@ -0,0 +1,12 @@
|
|||||||
|
#!/usr/bin/env bash
# Simulate all genomes. Delegates to simulate_one.sh per genome.
# Prefer running via `gmake simulate` which handles individual dependencies.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Expand the glob with nullglob so that an empty genomes/ directory gives an
# empty list; otherwise the literal pattern ".../genomes/*.fna.gz" would be
# handed to make_deps.py as if it were a file.
shopt -s nullglob
genomes=("${SCRIPT_DIR}"/genomes/*.fna.gz)
shopt -u nullglob

if [[ ${#genomes[@]} -eq 0 ]]; then
    echo "ERROR: no genomes found in ${SCRIPT_DIR}/genomes" >&2
    exit 1
fi

for genome_file in "${genomes[@]}"; do
    # NOTE(review): relies on make_deps.py accepting `--dir-for GENOME` and
    # printing the output directory for that genome; confirm it does.
    out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
        --dir-for "${genome_file}")
    bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
done
|
||||||
@@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
set -euo pipefail

if [[ $# -ne 2 ]]; then
    echo "Usage: $(basename "$0") genome.fna.gz output_dir" >&2
    exit 2
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
# CPUS may be preset by the caller; otherwise ask macOS, then Linux, else 2.
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"

genome_file="$1"
out_dir="$2"

mkdir -p "${out_dir}"

# Use a private temp directory rather than a template with a suffix after the
# X's: BSD/macOS mktemp only substitutes trailing X's, so
# "obikmer_XXXXXX.fna" is not randomised there and parallel runs collide.
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")
trap 'rm -rf "${tmp_dir}"' EXIT
tmp_fasta="${tmp_dir}/genome.fna"

gzip -dc "${genome_file}" > "${tmp_fasta}"

# Count sequence characters only (header lines and whitespace excluded).
# grep exits 1 when nothing is selected; `|| true` keeps pipefail from
# aborting silently so the explicit check below can report the problem.
genome_size=$({ grep -v "^>" "${tmp_fasta}" || true; } | tr -d '[:space:]' | wc -c | tr -d ' ')
if [[ "${genome_size}" -eq 0 ]]; then
    echo "ERROR: no sequence data in ${genome_file}" >&2
    exit 1
fi

# ceil(COVERAGE * genome_size / (2 * READ_LENGTH)) in integer arithmetic.
n_reads=$(( (COVERAGE * genome_size + 2 * READ_LENGTH - 1) / (2 * READ_LENGTH) ))

echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"

"${ISS}" generate \
    --genomes "${tmp_fasta}" \
    --model HiSeq \
    --n_reads "${n_reads}" \
    --cpus "${CPUS}" \
    --compress \
    --output "${out_dir}/reads"
|
||||||
Executable
+181
@@ -0,0 +1,181 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare an obikmer count index against a reference kmer set (presence + counts).
|
||||||
|
|
||||||
|
Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
|
||||||
|
streams `obikmer dump` from a --with-counts index, then reports:
|
||||||
|
- false negatives : kmers in reference absent from the index
|
||||||
|
- false positives : kmers in the index absent from the reference
|
||||||
|
- count mismatches: kmers present in both but with differing counts
|
||||||
|
|
||||||
|
Output to stdout: one CSV row
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||||
|
fn_pct,fp_pct,cm_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
_DECODE = ['A', 'C', 'G', 'T']
|
||||||
|
|
||||||
|
|
||||||
|
def encode_kmer(s: str) -> int:
    """Pack a nucleotide string into an integer, two bits per base.

    Bases are mapped through _ENCODE (A=0, C=1, G=2, T=3, either case); the
    first base ends up in the most significant position.  Any other
    character raises KeyError.
    """
    code = 0
    for base in s:
        # Codes are < 4, so multiply-and-add equals shift-and-or.
        code = code * 4 + _ENCODE[base]
    return code
|
||||||
|
|
||||||
|
|
||||||
|
def decode_kmer(val: int, k: int) -> str:
    """Unpack the k lowest two-bit groups of *val* into a nucleotide string.

    Inverse of encode_kmer for a kmer of length *k*: the most significant
    of the k groups becomes the first character.
    """
    return ''.join(_DECODE[(val >> (2 * pos)) & 3]
                   for pos in range(k - 1, -1, -1))
|
||||||
|
|
||||||
|
|
||||||
|
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32).

    Runs `<obikmer_bin> dump <index_dir>` and reads its stdout as CSV: the
    first line is skipped as a header, and on every other line the first
    field is the kmer string and the second its count.  Blank lines are
    ignored.  The child's stderr is discarded.

    Returns both arrays ordered by increasing encoded kmer value.  Exits the
    interpreter with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers, counts = [], []
    # The context manager closes the pipe and reaps the child even when a
    # malformed line raises while the dump is still being read.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE,
                          stderr=subprocess.DEVNULL, text=True) as proc:
        next(proc.stdout, None)  # header line
        for line in proc.stdout:
            row = line.rstrip('\n')
            if not row:
                continue
            parts = row.split(',')
            kmers.append(encode_kmer(parts[0]))
            counts.append(int(parts[1]))
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    # Build each array once, then apply the same permutation to both.
    kmer_arr = np.array(kmers, dtype=np.uint64)
    count_arr = np.array(counts, dtype=np.uint32)
    order = np.argsort(kmer_arr, kind='stable')
    return kmer_arr[order], count_arr[order]
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
            idx_kmers: np.ndarray, idx_counts: np.ndarray,
            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).

    false_neg: kmers of the reference that are absent from the index.
    false_pos: kmers of the index that are absent from the reference.
    cm_*:      kmers present in both but with differing counts, together
               with the reference count and the index count of each.

    Both kmer arrays must be sorted and free of duplicates (searchsorted and
    assume_unique=True rely on it); *_counts are aligned with their kmer
    array.  All returned arrays are sorted by kmer.  Either side may be
    empty.
    """
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)

    # Nothing can be shared when one side is empty.  Without this guard an
    # empty index makes the clip below produce -1 indices and the lookup
    # raises IndexError.
    if len(idx_kmers) == 0 or len(ref_kmers) == 0:
        return false_neg, false_pos, ref_kmers[:0], ref_counts[:0], idx_counts[:0]

    # Locate every reference kmer in the index; positions past the end are
    # clipped to the last slot and rejected by the equality test.
    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
    pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
    shared_mask = idx_kmers[pos_in_idx] == ref_kmers

    shared_ref_counts = ref_counts[shared_mask]
    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
    mismatch_mask = shared_ref_counts != shared_idx_counts

    cm_kmers = ref_kmers[shared_mask][mismatch_mask]
    cm_ref_counts = shared_ref_counts[mismatch_mask]
    cm_idx_counts = shared_idx_counts[mismatch_mask]

    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: compare one count index with its reference.

    Prints progress and a summary to stderr and one CSV data row to stdout
    (species, strain, sizes, false negatives/positives, count mismatches and
    the three percentages).  With ``--header`` only the CSV header is printed.

    REF_NPZ and INDEX_DIR are declared optional so that ``--header`` can be
    used alone; for a real comparison both are required and their absence is
    reported as a usage error.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?',
                    help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='obikmer index directory (built with --with-counts)')
    ap.add_argument('--obikmer', default='obikmer',
                    help='Path to obikmer binary')
    ap.add_argument('--species', default='')
    ap.add_argument('--strain', default='')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    ap.add_argument('--save-cm', metavar='FILE',
                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # Without this check a missing positional reaches subprocess as None and
    # fails with an opaque TypeError.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the first data row of a one-row dump.
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        # Header only (or nothing at all): k cannot be inferred.
        print(f'ERROR: obikmer dump returned no kmer row for {args.index}; '
              'cannot detect k', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    # Load reference; the arrays are read inside the context manager so the
    # archive handle is released right away.
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    with np.load(args.reference) as npz:
        ref_kmers = npz['kmers']    # sorted uint64
        ref_counts = npz['counts']  # uint32

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers, idx_counts = load_index(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
        ref_kmers, ref_counts, idx_kmers, idx_counts)

    # Shared kmers = reference kmers that are not false negatives.
    n_shared = len(ref_kmers) - len(false_neg)
    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0

    print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
    print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)',
          file=sys.stderr)

    # Detail files are only written when there is something to report.
    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')

    if args.save_cm and len(cm_kmers):
        with open(args.save_cm, 'w') as fh:
            fh.write('kmer,ref_count,idx_count\n')
            for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
                fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
          f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+201
@@ -0,0 +1,201 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Verify the merged count index against all per-specimen reference sets.
|
||||||
|
|
||||||
|
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||||
|
kmer+count pairs from each column, then compares each against its reference .npz.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row per specimen (same columns as verify_count.py)
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
|
||||||
|
fn_pct,fp_pct,cm_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
_DECODE = ['A', 'C', 'G', 'T']
|
||||||
|
|
||||||
|
|
||||||
|
def encode_kmer(s: str) -> int:
|
||||||
|
kmer = 0
|
||||||
|
for c in s:
|
||||||
|
kmer = (kmer << 2) | _ENCODE[c]
|
||||||
|
return kmer
|
||||||
|
|
||||||
|
|
||||||
|
def decode_kmer(val: int, k: int) -> str:
|
||||||
|
bases = []
|
||||||
|
for _ in range(k):
|
||||||
|
bases.append(_DECODE[val & 3])
|
||||||
|
val >>= 2
|
||||||
|
return ''.join(reversed(bases))
|
||||||
|
|
||||||
|
|
||||||
|
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
    """Stream the merged dump once.

    Runs ``<obikmer_bin> dump <index_dir>`` and parses its CSV output: the
    first line is the header (first column is the kmer, the remaining columns
    are specimen labels), every following line is a kmer string followed by
    one integer count per specimen.

    Returns:
      specimen_names : column labels in dump order
      per_specimen   : mapping label → (kmer_ints, counts) for entries > 0

    Exits the process with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    # The context manager closes the pipe and reaps the child even when a
    # malformed row makes parsing raise halfway through the stream.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        header_line = proc.stdout.readline().rstrip('\n')
        cols = header_line.split(',')
        specimen_names = cols[1:]
        per_specimen: dict[str, tuple[list[int], list[int]]] = {
            name: ([], []) for name in specimen_names}
        # Resolve the accumulators once instead of one dict lookup per cell.
        columns = [per_specimen[name] for name in specimen_names]

        for line in proc.stdout:
            parts = line.rstrip('\n').split(',')
            kmer_int = encode_kmer(parts[0])
            # Column i of the header matches field i of the row; indexing
            # (rather than zip) keeps a short row a hard error.
            for i, (kmer_acc, count_acc) in enumerate(columns, start=1):
                count = int(parts[i])
                if count > 0:
                    kmer_acc.append(kmer_int)
                    count_acc.append(count)

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||||
|
|
||||||
|
|
||||||
|
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare_specimen(name: str,
|
||||||
|
kmer_list: list[int],
|
||||||
|
count_list: list[int],
|
||||||
|
ref_dir: Path,
|
||||||
|
k: int,
|
||||||
|
save_fn: Path | None,
|
||||||
|
save_fp: Path | None,
|
||||||
|
save_cm: Path | None,
|
||||||
|
) -> str:
|
||||||
|
ref_path = ref_dir / f'{name}.npz'
|
||||||
|
if not ref_path.exists():
|
||||||
|
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
species = name.split('--')[0]
|
||||||
|
strain = name[len(species) + 2:]
|
||||||
|
|
||||||
|
npz = np.load(ref_path)
|
||||||
|
ref_kmers = npz['kmers'] # sorted uint64
|
||||||
|
ref_counts = npz['counts'] # uint32
|
||||||
|
|
||||||
|
order = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
|
||||||
|
idx_kmers = np.array(kmer_list, dtype=np.uint64)[order]
|
||||||
|
idx_counts = np.array(count_list, dtype=np.uint32)[order]
|
||||||
|
|
||||||
|
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||||
|
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||||
|
|
||||||
|
# Count mismatches among shared kmers
|
||||||
|
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
|
||||||
|
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
|
||||||
|
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
|
||||||
|
mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
|
||||||
|
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
|
||||||
|
cm_ref = ref_counts[shared_mask][mismatch_mask]
|
||||||
|
cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
|
||||||
|
|
||||||
|
n_shared = int(shared_mask.sum())
|
||||||
|
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||||
|
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||||
|
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
|
||||||
|
|
||||||
|
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||||
|
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||||
|
f'fp={len(false_pos):,} ({fp_pct:.4f}%) '
|
||||||
|
f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
if save_fn and len(false_neg):
|
||||||
|
fn_file = save_fn / f'{name}_fn.txt'
|
||||||
|
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||||
|
|
||||||
|
if save_fp and len(false_pos):
|
||||||
|
fp_file = save_fp / f'{name}_fp.txt'
|
||||||
|
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||||
|
|
||||||
|
if save_cm and len(cm_kmers):
|
||||||
|
cm_file = save_cm / f'{name}_cm.csv'
|
||||||
|
lines = ['kmer,ref_count,idx_count']
|
||||||
|
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
|
||||||
|
lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
|
||||||
|
cm_file.write_text('\n'.join(lines) + '\n')
|
||||||
|
|
||||||
|
return (f'{species},{strain},'
|
||||||
|
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||||
|
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
|
||||||
|
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: verify every specimen of a merged count index.

    Streams the merged dump once, then prints one CSV row per specimen to
    stdout (specimens without a reference file are skipped).  With
    ``--header`` only the CSV header is printed.

    INDEX_DIR and REF_DIR are declared optional so that ``--header`` can be
    used alone; for a real run both are required and their absence is
    reported as a usage error.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged count index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory for false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory for false-positive kmer lists')
    ap.add_argument('--save-cm', metavar='DIR',
                    help='Directory for count-mismatch CSV files')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # Without this check Path(None) below fails with an opaque TypeError.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    save_cm = Path(args.save_cm) if args.save_cm else None
    for d in (save_fn, save_fp, save_cm):
        if d:
            d.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of a one-row dump.
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        # Header only (or nothing at all): k cannot be inferred.
        print(f'ERROR: obikmer dump returned no kmer row for {args.index}; '
              'cannot detect k', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        kmers, counts = per_specimen[name]
        row = compare_specimen(name, kmers, counts, ref_dir, k,
                               save_fn, save_fp, save_cm)
        # An empty row means the specimen was skipped (no reference file).
        if row:
            print(row)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+27
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
# Verify the merged count index against every per-specimen reference and
# archive the resulting CSV under stats/verify_merge_count/.
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

obikmer_bin="${here}/../src/target/release/obikmer"
python_bin="${here}/../.venv/bin/python3"
verifier="${here}/verify_merge_count.py"
merged_index="${here}/global_index_count"
references="${here}/reference_index"
out_dir="${here}/stats/verify_merge_count"
latest="${out_dir}/current.csv"

mkdir -p "${out_dir}"

# Header row first, then one data row per specimen, all into current.csv.
{
    "${python_bin}" "${verifier}" --header
    "${python_bin}" "${verifier}" --obikmer "${obikmer_bin}" "${merged_index}" "${references}"
} >"${latest}"

# The archive number is the count of count_*.csv files already present,
# zero-padded to three digits.
printf -v run_id '%03d' "$(find "${out_dir}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')"
archive="${out_dir}/count_${run_id}.csv"
cp "${latest}" "${archive}"

echo "Done. Results → ${archive}"
|
||||||
Executable
+170
@@ -0,0 +1,170 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Verify the merged presence index against all per-specimen reference sets.
|
||||||
|
|
||||||
|
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||||||
|
kmer sets from each column, then compares each against its reference .npz.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
|
||||||
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
_DECODE = ['A', 'C', 'G', 'T']
|
||||||
|
|
||||||
|
|
||||||
|
def encode_kmer(s: str) -> int:
|
||||||
|
kmer = 0
|
||||||
|
for c in s:
|
||||||
|
kmer = (kmer << 2) | _ENCODE[c]
|
||||||
|
return kmer
|
||||||
|
|
||||||
|
|
||||||
|
def decode_kmer(val: int, k: int) -> str:
|
||||||
|
bases = []
|
||||||
|
for _ in range(k):
|
||||||
|
bases.append(_DECODE[val & 3])
|
||||||
|
val >>= 2
|
||||||
|
return ''.join(reversed(bases))
|
||||||
|
|
||||||
|
|
||||||
|
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def stream_merged_dump(obikmer_bin: str, index_dir: str,
                       ) -> tuple[list[str], dict[str, list[int]]]:
    """Stream the merged dump once.

    Runs ``<obikmer_bin> dump <index_dir>`` and parses its CSV output: the
    first line is the header, every following line is a kmer string followed
    by one integer per specimen column.

    Returns:
      specimen_names : column labels in dump order (excluding 'kmer')
      per_specimen   : mapping label → list of kmer ints where presence > 0

    Exits the process with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    # The context manager closes the pipe and reaps the child even when a
    # malformed row makes parsing raise halfway through the stream.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        header_line = proc.stdout.readline().rstrip('\n')
        cols = header_line.split(',')
        specimen_names = cols[1:]  # first col is 'kmer'
        per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
        # Resolve the accumulators once instead of one dict lookup per cell.
        columns = [per_specimen[name] for name in specimen_names]

        for line in proc.stdout:
            parts = line.rstrip('\n').split(',')
            kmer_int = encode_kmer(parts[0])
            # Column i of the header matches field i of the row; indexing
            # (rather than zip) keeps a short row a hard error.
            for i, kmer_acc in enumerate(columns, start=1):
                if int(parts[i]) > 0:
                    kmer_acc.append(kmer_int)

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    return specimen_names, per_specimen
|
||||||
|
|
||||||
|
|
||||||
|
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare_specimen(name: str,
|
||||||
|
kmer_list: list[int],
|
||||||
|
ref_dir: Path,
|
||||||
|
k: int,
|
||||||
|
save_fn: Path | None,
|
||||||
|
save_fp: Path | None,
|
||||||
|
) -> str:
|
||||||
|
"""Compare one specimen column against its reference .npz.
|
||||||
|
|
||||||
|
Returns a CSV row string.
|
||||||
|
"""
|
||||||
|
ref_path = ref_dir / f'{name}.npz'
|
||||||
|
if not ref_path.exists():
|
||||||
|
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||||||
|
return ''
|
||||||
|
|
||||||
|
species = name.split('--')[0]
|
||||||
|
strain = name[len(species) + 2:]
|
||||||
|
|
||||||
|
ref_kmers = np.load(ref_path)['kmers'] # sorted uint64
|
||||||
|
idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
|
||||||
|
|
||||||
|
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||||||
|
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||||||
|
|
||||||
|
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||||||
|
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||||||
|
|
||||||
|
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||||||
|
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||||||
|
f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
|
if save_fn and len(false_neg):
|
||||||
|
fn_file = save_fn / f'{name}_fn.txt'
|
||||||
|
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||||||
|
|
||||||
|
if save_fp and len(false_pos):
|
||||||
|
fp_file = save_fp / f'{name}_fp.txt'
|
||||||
|
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||||||
|
|
||||||
|
return (f'{species},{strain},'
|
||||||
|
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||||||
|
f'{len(false_neg)},{len(false_pos)},'
|
||||||
|
f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: verify every specimen of a merged presence index.

    Streams the merged dump once, then prints one CSV row per specimen to
    stdout (specimens without a reference file are skipped).  With
    ``--header`` only the CSV header is printed.

    INDEX_DIR and REF_DIR are declared optional so that ``--header`` can be
    used alone; for a real run both are required and their absence is
    reported as a usage error.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
                    help='Merged presence index directory')
    ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
                    help='Directory containing per-specimen .npz reference files')
    ap.add_argument('--obikmer', default='obikmer')
    ap.add_argument('--header', action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fn', metavar='DIR',
                    help='Directory to save false-negative kmer lists')
    ap.add_argument('--save-fp', metavar='DIR',
                    help='Directory to save false-positive kmer lists')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # Without this check Path(None) below fails with an opaque TypeError.
    if args.index is None or args.ref_dir is None:
        ap.error('INDEX_DIR and REF_DIR are required unless --header is given')

    ref_dir = Path(args.ref_dir)
    save_fn = Path(args.save_fn) if args.save_fn else None
    save_fp = Path(args.save_fp) if args.save_fp else None
    if save_fn:
        save_fn.mkdir(parents=True, exist_ok=True)
    if save_fp:
        save_fp.mkdir(parents=True, exist_ok=True)

    # Detect k from the first data row of a one-row dump.
    out1 = subprocess.check_output(
        [args.obikmer, 'dump', '--head', '1', args.index],
        stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        # Header only (or nothing at all): k cannot be inferred.
        print(f'ERROR: obikmer dump returned no kmer row for {args.index}; '
              'cannot detect k', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
    specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
    print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)

    for name in specimen_names:
        row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
        # An empty row means the specimen was skipped (no reference file).
        if row:
            print(row)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
Executable
+27
@@ -0,0 +1,27 @@
|
|||||||
|
#!/usr/bin/env bash
# Verify the merged presence index against every per-specimen reference and
# archive the resulting CSV under stats/verify_merge_presence/.
set -euo pipefail

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

obikmer_bin="${here}/../src/target/release/obikmer"
python_bin="${here}/../.venv/bin/python3"
verifier="${here}/verify_merge_presence.py"
merged_index="${here}/global_index_presence"
references="${here}/reference_index"
out_dir="${here}/stats/verify_merge_presence"
latest="${out_dir}/current.csv"

mkdir -p "${out_dir}"

# Header row first, then one data row per specimen, all into current.csv.
{
    "${python_bin}" "${verifier}" --header
    "${python_bin}" "${verifier}" --obikmer "${obikmer_bin}" "${merged_index}" "${references}"
} >"${latest}"

# The archive number is the count of presence_*.csv files already present,
# zero-padded to three digits.
printf -v run_id '%03d' "$(find "${out_dir}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')"
archive="${out_dir}/presence_${run_id}.csv"
cp "${latest}" "${archive}"

echo "Done. Results → ${archive}"
|
||||||
Executable
+30
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: verify_one_count.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Output: stats/verify_count/SPECIMEN.stats  (one CSV data row, no header)
#
# The row is written to a temporary file and renamed into place only when the
# verification succeeds, so a failed run never leaves an empty or partial
# .stats file that a later Make invocation could mistake for a finished target.
set -euo pipefail

if [[ $# -lt 1 || -z "$1" ]]; then
    echo "Usage: $(basename "$0") SPECIMEN   (SPECIMEN = species--strain)" >&2
    exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_count.py"

# Split on the first "--": text before it is the species, the rest the strain.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
TMP_FILE="${STATS_FILE}.tmp.$$"

mkdir -p "${STATS_DIR}"

# Remove the temporary file on any exit; after a successful mv it no longer
# exists and rm -f is a no-op.
trap 'rm -f "${TMP_FILE}"' EXIT

echo "[${SPECIMEN}] verifying count"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv -f "${TMP_FILE}" "${STATS_FILE}"
|
||||||
Executable
+30
@@ -0,0 +1,30 @@
|
|||||||
|
#!/usr/bin/env bash
# Usage: verify_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Output: stats/verify_presence/SPECIMEN.stats  (one CSV data row, no header)
#
# The row is written to a temporary file and renamed into place only when the
# verification succeeds, so a failed run never leaves an empty or partial
# .stats file that a later Make invocation could mistake for a finished target.
set -euo pipefail

if [[ $# -lt 1 || -z "$1" ]]; then
    echo "Usage: $(basename "$0") SPECIMEN   (SPECIMEN = species--strain)" >&2
    exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"

# Split on the first "--": text before it is the species, the rest the strain.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
TMP_FILE="${STATS_FILE}.tmp.$$"

mkdir -p "${STATS_DIR}"

# Remove the temporary file on any exit; after a successful mv it no longer
# exists and rm -f is a no-op.
trap 'rm -f "${TMP_FILE}"' EXIT

echo "[${SPECIMEN}] verifying presence"

"${PYTHON}" "${VERIFY_PY}" \
    --obikmer "${BINARY}" \
    --species "${species}" \
    --strain "${strain}" \
    "${REF_NPZ}" "${INDEX_DIR}" \
    >"${TMP_FILE}"

mv -f "${TMP_FILE}" "${STATS_FILE}"
|
||||||
Executable
+139
@@ -0,0 +1,139 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Compare an obikmer index against a reference kmer set (presence/absence).
|
||||||
|
|
||||||
|
Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
|
||||||
|
streams the output of `obikmer dump`, encodes each kmer string to uint64,
|
||||||
|
then reports false negatives and false positives using numpy set operations.
|
||||||
|
|
||||||
|
Output to stdout: one CSV row
|
||||||
|
species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||||||
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||||||
|
|
||||||
|
_DECODE = ['A', 'C', 'G', 'T']
|
||||||
|
|
||||||
|
|
||||||
|
def encode_kmer(s: str) -> int:
|
||||||
|
kmer = 0
|
||||||
|
for c in s:
|
||||||
|
kmer = (kmer << 2) | _ENCODE[c]
|
||||||
|
return kmer
|
||||||
|
|
||||||
|
|
||||||
|
def decode_kmer(val: int, k: int) -> str:
|
||||||
|
bases = []
|
||||||
|
for _ in range(k):
|
||||||
|
bases.append(_DECODE[val & 3])
|
||||||
|
val >>= 2
|
||||||
|
return ''.join(reversed(bases))
|
||||||
|
|
||||||
|
|
||||||
|
# ── dump parsing ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
    """Stream `obikmer dump` and return a sorted uint64 array of kmer integers.

    The first output line is treated as a header and skipped; on every other
    line the text before the first comma is the kmer string.  Exits the
    process with status 1 if the dump command fails.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers = []
    # The context manager closes the pipe and reaps the child even when an
    # unexpected character makes encoding raise halfway through the stream.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        rows = iter(proc.stdout)
        next(rows, None)  # skip the header line, if any
        for line in rows:
            # Strip the newline first so a row without any comma does not
            # feed '\n' to the encoder.
            kmer_str = line.rstrip('\n').split(',', 1)[0]
            kmers.append(encode_kmer(kmer_str))

    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)

    arr = np.array(kmers, dtype=np.uint64)
    arr.sort()
    return arr
|
||||||
|
|
||||||
|
|
||||||
|
# ── comparison ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return (false_negatives, false_positives) as uint64 arrays.

    False negatives are the kmers of ``ref`` absent from ``idx``; false
    positives are the kmers of ``idx`` absent from ``ref``.  Both inputs are
    taken to be duplicate-free (``assume_unique=True``).
    """
    # Same set difference applied in both directions.
    return tuple(np.setdiff1d(left, right, assume_unique=True)
                 for left, right in ((ref, idx), (idx, ref)))
|
||||||
|
|
||||||
|
|
||||||
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: compare one presence index with its reference.

    Prints progress and a summary to stderr and one CSV data row to stdout
    (species, strain, sizes, false negatives/positives and their
    percentages).  With ``--header`` only the CSV header is printed.

    REF_NPZ and INDEX_DIR are declared optional so that ``--header`` can be
    used alone; for a real comparison both are required and their absence is
    reported as a usage error.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file')
    ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
    ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary')
    ap.add_argument('--species', default='', help='Species label for CSV row')
    ap.add_argument('--strain', default='', help='Strain label for CSV row')
    ap.add_argument('--header', action='store_true', help='Print CSV header and exit')
    ap.add_argument('--save-fp', metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn', metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,fn_pct,fp_pct')
        return

    # Without this check a missing positional reaches subprocess as None and
    # fails with an opaque TypeError.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k from the index (one cheap call before the full dump).
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    dump_lines = out1.splitlines()
    if len(dump_lines) < 2:
        # Header only (or nothing at all): k cannot be inferred.
        print(f'ERROR: obikmer dump returned no kmer row for {args.index}; '
              'cannot detect k', file=sys.stderr)
        sys.exit(1)
    k = len(dump_lines[1].split(',')[0])

    # Load reference; the array is read inside the context manager so the
    # archive handle is released right away.
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    with np.load(args.reference) as npz:
        ref_kmers = npz['kmers']  # already sorted uint64

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers = load_index_kmers(args.obikmer, args.index)

    print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos = compare(ref_kmers, idx_kmers)

    fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0

    print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)

    # Detail files are only written when there is something to report.
    if args.save_fn and len(false_neg):
        with open(args.save_fn, 'w') as fh:
            for v in false_neg:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False negatives saved → {args.save_fn}', file=sys.stderr)

    if args.save_fp and len(false_pos):
        with open(args.save_fp, 'w') as fh:
            for v in false_pos:
                fh.write(decode_kmer(int(v), k) + '\n')
        print(f'False positives saved → {args.save_fp}', file=sys.stderr)

    print(f'{args.species},{args.strain},'
          f'{len(ref_kmers)},{len(idx_kmers)},'
          f'{len(false_neg)},{len(false_pos)},'
          f'{fn_pct:.4f},{fp_pct:.4f}')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -29,16 +29,17 @@ Multiple values separated by `|` are always OR-ed within the predicate.
|
|||||||
|
|
||||||
### Path matching (`~` and `!~`)
|
### Path matching (`~` and `!~`)
|
||||||
|
|
||||||
Metadata values can represent hierarchical taxonomic paths such as
|
Metadata values can represent hierarchical concept paths such as
|
||||||
`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
|
`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
|
||||||
|
|
||||||
- **Absolute pattern** (starts with `/`): the value must start with the pattern
|
**Both the stored metadata value and the pattern must start with `/`.**
|
||||||
at a segment boundary.
|
A pattern that does not start with `/` is rejected at parse time with an error.
|
||||||
`taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
|
|
||||||
|
The value matches the pattern if it equals it exactly or starts with the pattern
|
||||||
|
followed by `/` (segment-boundary prefix):
|
||||||
|
|
||||||
|
- `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
|
||||||
`/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
|
`/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
|
||||||
- **Bare segment** (no leading `/`): the value must contain the pattern as an
|
|
||||||
exact path component anywhere.
|
|
||||||
`taxon~Betula` matches any path that has `Betula` as one of its segments.
|
|
||||||
|
|
||||||
### Missing metadata key → NA
|
### Missing metadata key → NA
|
||||||
|
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ nav:
|
|||||||
- Merge parallelism & memory: implementation/merge_parallelism.md
|
- Merge parallelism & memory: implementation/merge_parallelism.md
|
||||||
- Kmer filtering: implementation/filtering.md
|
- Kmer filtering: implementation/filtering.md
|
||||||
- Select command: implementation/select.md
|
- Select command: implementation/select.md
|
||||||
|
- obitaxonomy crate: implementation/obitaxonomy.md
|
||||||
- Architecture:
|
- Architecture:
|
||||||
- Sequences: architecture/sequences/invariant.md
|
- Sequences: architecture/sequences/invariant.md
|
||||||
- Kmer index: architecture/index_architecture.md
|
- Kmer index: architecture/index_architecture.md
|
||||||
|
|||||||
Generated
+4
@@ -1853,6 +1853,10 @@ dependencies = [
|
|||||||
"tracing",
|
"tracing",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "obitaxonomy"
|
||||||
|
version = "0.1.0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "object"
|
name = "object"
|
||||||
version = "0.37.3"
|
version = "0.37.3"
|
||||||
|
|||||||
+1
-1
@@ -1,5 +1,5 @@
|
|||||||
[workspace]
|
[workspace]
|
||||||
resolver = "3"
|
resolver = "3"
|
||||||
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
|
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
|
||||||
[profile.release]
|
[profile.release]
|
||||||
debug = 1
|
debug = 1
|
||||||
|
|||||||
@@ -88,9 +88,9 @@ impl<'a> IntoIterator for &'a PersistentBitVec {
|
|||||||
// ── BitIter ───────────────────────────────────────────────────────────────────
|
// ── BitIter ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
pub struct BitIter<'a> {
|
pub struct BitIter<'a> {
|
||||||
pub(crate) words: &'a [u64],
|
words: &'a [u64],
|
||||||
pub(crate) slot: usize,
|
slot: usize,
|
||||||
pub(crate) n: usize,
|
n: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ExactSizeIterator for BitIter<'_> {}
|
impl ExactSizeIterator for BitIter<'_> {}
|
||||||
@@ -132,7 +132,7 @@ impl PersistentBitVecBuilder {
|
|||||||
Ok(Self { mmap, n, path: path.to_path_buf() })
|
Ok(Self { mmap, n, path: path.to_path_buf() })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
|
pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
|
||||||
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
let file_size = HEADER_SIZE + n_bytes_for_words(n);
|
||||||
let file = OpenOptions::new()
|
let file = OpenOptions::new()
|
||||||
.read(true).write(true).create(true).truncate(true)
|
.read(true).write(true).create(true).truncate(true)
|
||||||
|
|||||||
@@ -18,11 +18,11 @@ pub use builder::PersistentCompactIntVecBuilder;
|
|||||||
pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
|
pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
|
||||||
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
|
||||||
pub use layer_meta::LayerMeta;
|
pub use layer_meta::LayerMeta;
|
||||||
pub use reader::PersistentCompactIntVec;
|
pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
|
||||||
pub use tempbitvec::TempBitVec;
|
pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
|
||||||
pub use tempintvec::TempCompactIntVec;
|
pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
|
||||||
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
pub use traits::{BitPartials, ColumnWeights, CountPartials};
|
||||||
pub use views::{BitSliceView, IntSliceView};
|
pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
#[path = "tests/mod.rs"]
|
#[path = "tests/mod.rs"]
|
||||||
|
|||||||
@@ -43,27 +43,27 @@ impl TempBitVec {
|
|||||||
|
|
||||||
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
|
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
|
||||||
|
|
||||||
pub(crate) struct TempBitVecBuilder {
|
pub struct TempBitVecBuilder {
|
||||||
builder: PersistentBitVecBuilder,
|
builder: PersistentBitVecBuilder,
|
||||||
temp: TempDir,
|
temp: TempDir,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TempBitVecBuilder {
|
impl TempBitVecBuilder {
|
||||||
pub(crate) fn new(n: usize) -> io::Result<Self> {
|
pub fn new(n: usize) -> io::Result<Self> {
|
||||||
let temp = TempDir::new()?;
|
let temp = TempDir::new()?;
|
||||||
let path = temp.path().join("data.pbiv");
|
let path = temp.path().join("data.pbiv");
|
||||||
let builder = PersistentBitVecBuilder::new(n, &path)?;
|
let builder = PersistentBitVecBuilder::new(n, &path)?;
|
||||||
Ok(Self { builder, temp })
|
Ok(Self { builder, temp })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn new_ones(n: usize) -> io::Result<Self> {
|
pub fn new_ones(n: usize) -> io::Result<Self> {
|
||||||
let temp = TempDir::new()?;
|
let temp = TempDir::new()?;
|
||||||
let path = temp.path().join("data.pbiv");
|
let path = temp.path().join("data.pbiv");
|
||||||
let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
|
let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
|
||||||
Ok(Self { builder, temp })
|
Ok(Self { builder, temp })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
|
pub fn freeze(self) -> io::Result<TempBitVec> {
|
||||||
let Self { builder, temp } = self;
|
let Self { builder, temp } = self;
|
||||||
let vec = builder.finish()?;
|
let vec = builder.finish()?;
|
||||||
Ok(TempBitVec { vec, _temp: temp })
|
Ok(TempBitVec { vec, _temp: temp })
|
||||||
@@ -72,7 +72,8 @@ impl TempBitVecBuilder {
|
|||||||
pub fn set(&mut self, slot: usize, value: bool) {
|
pub fn set(&mut self, slot: usize, value: bool) {
|
||||||
self.builder.set(slot, value);
|
self.builder.set(slot, value);
|
||||||
}
|
}
|
||||||
pub(crate) fn view(&self) -> BitSliceView<'_> {
|
|
||||||
|
pub fn view(&self) -> BitSliceView<'_> {
|
||||||
self.builder.view()
|
self.builder.view()
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -80,19 +81,19 @@ impl TempBitVecBuilder {
|
|||||||
self.builder.or(other);
|
self.builder.or(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn and(&mut self, other: BitSliceView<'_>) {
|
pub fn and(&mut self, other: BitSliceView<'_>) {
|
||||||
self.builder.and(other);
|
self.builder.and(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn xor(&mut self, other: BitSliceView<'_>) {
|
pub fn xor(&mut self, other: BitSliceView<'_>) {
|
||||||
self.builder.xor(other);
|
self.builder.xor(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn not(&mut self) {
|
pub fn not(&mut self) {
|
||||||
self.builder.not();
|
self.builder.not();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) {
|
pub fn copy_from(&mut self, src: BitSliceView<'_>) {
|
||||||
self.builder.copy_from(src);
|
self.builder.copy_from(src);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,11 +101,11 @@ impl TempBitVecBuilder {
|
|||||||
self.builder.or_where(col, pred);
|
self.builder.or_where(col, pred);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
self.builder.and_where(col, pred);
|
self.builder.and_where(col, pred);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
self.builder.xor_where(col, pred);
|
self.builder.xor_where(col, pred);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,60 +32,58 @@ impl TempCompactIntVec {
|
|||||||
|
|
||||||
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
|
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
|
||||||
|
|
||||||
pub(crate) struct TempCompactIntVecBuilder {
|
pub struct TempCompactIntVecBuilder {
|
||||||
builder: PersistentCompactIntVecBuilder,
|
builder: PersistentCompactIntVecBuilder,
|
||||||
temp: TempDir,
|
temp: TempDir,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TempCompactIntVecBuilder {
|
impl TempCompactIntVecBuilder {
|
||||||
pub(crate) fn new(n: usize) -> io::Result<Self> {
|
pub fn new(n: usize) -> io::Result<Self> {
|
||||||
let temp = TempDir::new()?;
|
let temp = TempDir::new()?;
|
||||||
let path = temp.path().join("data.pciv");
|
let path = temp.path().join("data.pciv");
|
||||||
let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
|
let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
|
||||||
Ok(Self { builder, temp })
|
Ok(Self { builder, temp })
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
|
pub fn freeze(self) -> io::Result<TempCompactIntVec> {
|
||||||
let Self { builder, temp } = self;
|
let Self { builder, temp } = self;
|
||||||
let vec = builder.finish()?;
|
let vec = builder.finish()?;
|
||||||
Ok(TempCompactIntVec { vec, _temp: temp })
|
Ok(TempCompactIntVec { vec, _temp: temp })
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Delegation methods ────────────────────────────────────────────────────
|
pub fn n(&self) -> usize { self.builder.len() }
|
||||||
|
|
||||||
pub(crate) fn n(&self) -> usize { self.builder.len() }
|
pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
|
||||||
|
pub fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
|
||||||
|
|
||||||
pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
|
pub fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
|
||||||
pub(crate) fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
|
pub fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
|
||||||
|
|
||||||
pub(crate) fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
|
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
|
||||||
pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
|
|
||||||
|
|
||||||
pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) {
|
|
||||||
self.builder.inc_present(col);
|
self.builder.inc_present(col);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
|
||||||
self.builder.inc_present_fast(col);
|
self.builder.inc_present_fast(col);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
self.builder.inc_predicate(col, pred);
|
self.builder.inc_predicate(col, pred);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
|
||||||
self.builder.inc_predicate_fast(col, pred);
|
self.builder.inc_predicate_fast(col, pred);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn add(&mut self, other: IntSliceView<'_>) {
|
pub fn add(&mut self, other: IntSliceView<'_>) {
|
||||||
self.builder.add(other);
|
self.builder.add(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) {
|
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
|
||||||
self.builder.mask_with(mask);
|
self.builder.mask_with(mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
|
pub fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
|
||||||
pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
|
pub fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
|
||||||
pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
|
pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use crossbeam_channel;
|
|||||||
use hashbrown::HashMap;
|
use hashbrown::HashMap;
|
||||||
use obikseq::k;
|
use obikseq::k;
|
||||||
use obikseq::{CanonicalKmer, Sequence, Unitig};
|
use obikseq::{CanonicalKmer, Sequence, Unitig};
|
||||||
|
#[cfg(not(any(test, feature = "test-utils")))]
|
||||||
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
|
||||||
use std::cell::RefCell;
|
use std::cell::RefCell;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ use obilayeredmap::IndexMode;
|
|||||||
use crate::error::{OKIError, OKIResult};
|
use crate::error::{OKIError, OKIResult};
|
||||||
use crate::index::KmerIndex;
|
use crate::index::KmerIndex;
|
||||||
use crate::meta::{GenomeInfo, IndexMeta};
|
use crate::meta::{GenomeInfo, IndexMeta};
|
||||||
use crate::state::IndexState;
|
use crate::state::{IndexState, SENTINEL_INDEXED};
|
||||||
|
|
||||||
pub use obikpartitionner::MergeMode;
|
pub use obikpartitionner::MergeMode;
|
||||||
|
|
||||||
@@ -263,6 +263,8 @@ impl KmerIndex {
|
|||||||
rep.push(t.stop());
|
rep.push(t.stop());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
|
||||||
|
|
||||||
KmerIndex::open(output)
|
KmerIndex::open(output)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,6 +49,11 @@ impl MetaPred {
|
|||||||
if values.iter().any(|v| v.is_empty()) {
|
if values.iter().any(|v| v.is_empty()) {
|
||||||
return Err(format!("empty value in predicate: {s}"));
|
return Err(format!("empty value in predicate: {s}"));
|
||||||
}
|
}
|
||||||
|
if matches!(op, PredOp::Matches | PredOp::NotMatches) {
|
||||||
|
if let Some(v) = values.iter().find(|v| !v.starts_with('/')) {
|
||||||
|
return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(Self { key, op, values })
|
Ok(Self { key, op, values })
|
||||||
}
|
}
|
||||||
@@ -72,16 +77,12 @@ impl MetaPred {
|
|||||||
|
|
||||||
/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
|
/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
|
||||||
///
|
///
|
||||||
/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
|
/// Both `value` and `pattern` must start with `/`.
|
||||||
/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
|
/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`.
|
||||||
fn path_matches(value: &str, pattern: &str) -> bool {
|
fn path_matches(value: &str, pattern: &str) -> bool {
|
||||||
if pattern.starts_with('/') {
|
value == pattern
|
||||||
value == pattern
|
|| (value.starts_with(pattern)
|
||||||
|| (value.starts_with(pattern)
|
&& value[pattern.len()..].starts_with('/'))
|
||||||
&& value[pattern.len()..].starts_with('/'))
|
|
||||||
} else {
|
|
||||||
value.split('/').any(|seg| seg == pattern)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Three-value group evaluation ──────────────────────────────────────────────
|
// ── Three-value group evaluation ──────────────────────────────────────────────
|
||||||
|
|||||||
@@ -1,28 +0,0 @@
|
|||||||
>F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"}
|
|
||||||
GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA
|
|
||||||
>C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"}
|
|
||||||
AAAAAAATTAAGTCCCATATTGAAACATTAT
|
|
||||||
>9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"}
|
|
||||||
AAAAAATTAAGTCCCATATTGAAACATTATC
|
|
||||||
>74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"}
|
|
||||||
AAAAATTAAGTCCCATATTGAAACATTATCA
|
|
||||||
>45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"}
|
|
||||||
AAAATTAAGTCCCATATTGAAACATTATCAC
|
|
||||||
>5F44864B90170AF4 {"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"}
|
|
||||||
AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT
|
|
||||||
>8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"}
|
|
||||||
AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG
|
|
||||||
>C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"}
|
|
||||||
AATGTGAGTTGTTAATATTACATAATTGGGTT
|
|
||||||
>933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"}
|
|
||||||
TGTGAGTTGTTAATATTACATAATTGGGTTT
|
|
||||||
>3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"}
|
|
||||||
GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT
|
|
||||||
>1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"}
|
|
||||||
TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC
|
|
||||||
>8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"}
|
|
||||||
AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT
|
|
||||||
>247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"}
|
|
||||||
CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT
|
|
||||||
>11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"}
|
|
||||||
GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG
|
|
||||||
Reference in New Issue
Block a user