Push mtzqmmrlmzzx #34

Merged
coissac merged 25 commits from push-mtzqmmrlmzzx into main 2026-06-22 08:47:24 +00:00
42 changed files with 2585 additions and 84 deletions
Showing only changes of commit c694e1f2b0 - Show all commits
Vendored
BIN
View File
Binary file not shown.
+10
View File
@@ -9,3 +9,13 @@ data-stress
./**/*.json
*.bin
Betula_exilis--IGA-24-33
benchmark/genomes
benchmark/simulated_data
benchmark/specimen_index_presence
benchmark/specimen_index_count
benchmark/global_index_presence
benchmark/global_index_count
benchmark/stats
benchmark/reference_index
benchmark/specific_index_count
benchmark/specific_index_presence
+2
View File
@@ -0,0 +1,2 @@
/cache
/project.local.yml
+133
View File
@@ -0,0 +1,133 @@
# the name by which the project can be referenced within Serena
project_name: "obikmer"
# list of languages for which language servers are started; choose from:
# al angular ansible bash clojure
# cpp cpp_ccls crystal csharp csharp_omnisharp
# dart elixir elm erlang fortran
# fsharp go groovy haskell haxe
# hlsl html java json julia
# kotlin lean4 lua luau markdown
# matlab msl nix ocaml pascal
# perl php php_phpactor powershell python
# python_jedi python_ty r rego ruby
# ruby_solargraph rust scala scss solidity
# svelte swift systemverilog terraform toml
# typescript typescript_vts vue yaml zig
# (This list may be outdated. For the current list, see values of Language enum here:
# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
# Note:
# - For C, use cpp
# - For JavaScript, use typescript
# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
# - For Free Pascal/Lazarus, use pascal
# Special requirements:
# Some languages require additional setup/installations.
# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
# When using multiple languages, the first language server that supports a given file will be used for that file.
# The first language is the default language and the respective language server will be used as a fallback.
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
languages:
- rust
# the encoding used by text files in the project
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
encoding: "utf-8"
# line ending convention to use when writing source files.
# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
line_ending:
# The language backend to use for this project.
# If not set, the global setting from serena_config.yml is used.
# Valid values: LSP, JetBrains
# Note: the backend is fixed at startup. If a project with a different backend
# is activated post-init, an error will be returned.
language_backend:
# whether to use project's .gitignore files to ignore files
ignore_all_files_in_gitignore: true
# advanced configuration option allowing to configure language server-specific options.
# Maps the language key to the options.
# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
# No documentation on options means no options are available.
ls_specific_settings: {}
# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
# Paths can be absolute or relative to the project root.
# Each folder is registered as an LSP workspace folder, enabling language servers to discover
# symbols and references across package boundaries.
# Currently supported for: TypeScript.
# Example:
# additional_workspace_folders:
# - ../sibling-package
# - ../shared-lib
additional_workspace_folders: []
# list of additional paths to ignore in this project.
# Same syntax as gitignore, so you can use * and **.
# Note: global ignored_paths from serena_config.yml are also applied additively.
ignored_paths: []
# whether the project is in read-only mode
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
# Added on 2025-04-18
read_only: false
# list of tool names to exclude.
# This extends the existing exclusions (e.g. from the global configuration)
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
excluded_tools: []
# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
# This extends the existing inclusions (e.g. from the global configuration).
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
included_optional_tools: []
# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
# This cannot be combined with non-empty excluded_tools or included_optional_tools.
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
fixed_tools: []
# list of mode names that are to be activated by default, overriding the setting in the global configuration.
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
# for this project.
# This setting can, in turn, be overridden by CLI parameters (--mode).
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
default_modes:
# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
added_modes:
# initial prompt for the project. It will always be given to the LLM upon activating the project
# (contrary to the memories, which are loaded on demand).
initial_prompt: ""
# time budget (seconds) per tool call for the retrieval of additional symbol information
# such as docstrings or parameter information.
# This overrides the corresponding setting in the global configuration; see the documentation there.
# If null or missing, use the setting from the global configuration.
symbol_info_budget:
# list of regex patterns which, when matched, mark a memory entry as readonly.
# Extends the list from the global configuration, merging the two lists.
read_only_memory_patterns: []
# list of regex patterns for memories to completely ignore.
# Matching memories will not appear in list_memories or activate_project output
# and cannot be accessed via read_memory or write_memory.
# To access ignored memory files, use the read_file tool on the raw file path.
# Extends the list from the global configuration, merging the two lists.
# Example: ["_archive/.*", "_episodes/.*"]
ignored_memory_patterns: []
+26
View File
@@ -73,3 +73,29 @@ Lors de l'ajout de nouveaux fichiers Markdown dans `docmd/`, mettre à jour la s
---
Je continue à poser mes questions et à guider la discussion.
---
## MCP Tools
**Règle absolue : avant tout travail de code, appeler `mcp__serena__initial_instructions` pour charger les instructions Serena.**
### Hiérarchie des outils pour ce projet Rust
**Navigation et édition de code → serena en priorité**
- Trouver un symbole, une déclaration, les implémentations d'un trait : `mcp__serena__find_symbol`, `mcp__serena__find_declaration`, `mcp__serena__find_implementations`
- Trouver les usages d'un symbole : `mcp__serena__find_referencing_symbols`
- Diagnostics LSP (erreurs de compilation) : `mcp__serena__get_diagnostics_for_file`
- Vue d'ensemble d'un fichier : `mcp__serena__get_symbols_overview`
- Modifier le corps d'une fonction/impl : `mcp__serena__replace_symbol_body`
- Ne pas utiliser `cclsp` quand serena couvre le besoin
**Analyse architecturale → jcodemunch**
- Hotspots, couplage, dead code, dépendances entre modules
- Utiliser avant de refactorer une zone critique
**Raisonnement complexe → sequential-thinking**
- Décisions d'architecture, choix d'algorithme, trade-offs non triviaux
**Documentation de crates → context7**
- Toujours consulter avant d'utiliser une API de bibliothèque externe
+1
View File
@@ -22,6 +22,7 @@ $(MKDOCS): $(VENV)/bin/activate
mkdocs mkdocs-material \
mkdocs-mermaid2-plugin \
mkdocs-bibtex
$(PIP) install --quiet --upgrade InSilicoSeq
# ── obikmer binary ───────────────────────────────────────────────────────────
+144
View File
@@ -0,0 +1,144 @@
# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
BINARY := ../src/target/release/obikmer
VENV_PY := ../.venv/bin/python3
GENOMES := $(wildcard genomes/*.fna.gz)
# SPECIMENS, SPECIES, and the full dependency graph are generated by
# make_deps.py from the genome FASTA headers — like .d files in C.
# Make rebuilds deps.mk whenever genomes/ changes and restarts.
-include deps.mk
REF_NPZS := $(SPECIMENS:%=reference_index/%.npz)
PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done)
PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats)
COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done)
COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats)
VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats)
SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done)
SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done)
SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats)
SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
.NOTPARALLEL:
.PHONY: all simulate reference \
index_presence index_count \
aggregate_index_presence aggregate_index_count \
merge_presence merge_count \
verify_presence verify_count \
aggregate_verify_presence aggregate_verify_count \
verify_merge_presence verify_merge_count \
filter_presence filter_count \
aggregate_filter_presence aggregate_filter_count
verify_merge_presence: stats/verify_merge_presence/current.csv
verify_merge_count: stats/verify_merge_count/current.csv
all: aggregate_verify_presence aggregate_verify_count \
verify_merge_presence verify_merge_count \
aggregate_filter_presence aggregate_filter_count
# ── dependency file ───────────────────────────────────────────────────────────
deps.mk: $(GENOMES)
$(VENV_PY) make_deps.py $^ > $@
# ── simulation ────────────────────────────────────────────────────────────────
# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
$(SIMULATED_READS):
bash simulate_one.sh $< $(dir $@)
simulate: $(SIMULATED_READS)
# ── reference kmer sets ───────────────────────────────────────────────────────
# Prerequisites (reads → npz) are in deps.mk.
reference_index/%.npz:
bash build_reference.sh $*
reference: $(REF_NPZS)
# ── per-specimen indexing ─────────────────────────────────────────────────────
# Prerequisites (reads → index.done + .stats) are in deps.mk.
specimen_index_presence/%/index.done \
stats/indexing_presence/%.stats &: $(BINARY)
bash index_one_presence.sh $*
specimen_index_count/%/index.done \
stats/indexing_count/%.stats &: $(BINARY)
bash index_one_count.sh $*
index_presence: $(PRESENCE_DONE)
index_count: $(COUNT_DONE)
# ── indexing stats aggregation ────────────────────────────────────────────────
aggregate_index_presence: $(PRESENCE_STATS)
bash aggregate_stats.sh indexing_presence
aggregate_index_count: $(COUNT_STATS)
bash aggregate_stats.sh indexing_count
# ── global merge ──────────────────────────────────────────────────────────────
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
bash merge_presence.sh
global_index_count/index.done: $(COUNT_DONE) $(BINARY)
bash merge_count.sh
merge_presence: global_index_presence/index.done
merge_count: global_index_count/index.done
# ── per-specimen verification ─────────────────────────────────────────────────
# Prerequisites (index.done + npz → .stats) are in deps.mk.
stats/verify_presence/%.stats:
bash verify_one_presence.sh $*
stats/verify_count/%.stats:
bash verify_one_count.sh $*
verify_presence: $(VERIFY_PRESENCE_STATS)
verify_count: $(VERIFY_COUNT_STATS)
# ── verification stats aggregation ───────────────────────────────────────────
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
bash aggregate_stats.sh verify_presence
aggregate_verify_count: $(VERIFY_COUNT_STATS)
bash aggregate_stats.sh verify_count
# ── species-specific indexes ──────────────────────────────────────────────────
# Prerequisites (global index → specific index) are in deps.mk.
specific_index_presence/%/index.done \
stats/specific_kmer_presence/%.stats &: $(BINARY)
bash filter_one_presence.sh $*
specific_index_count/%/index.done \
stats/specific_kmer_count/%.stats &: $(BINARY)
bash filter_one_count.sh $*
filter_presence: $(SPECIFIC_PRESENCE_DONE)
filter_count: $(SPECIFIC_COUNT_DONE)
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
bash aggregate_stats.sh specific_kmer_presence
aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
bash aggregate_stats.sh specific_kmer_count
# ── merged index verification ─────────────────────────────────────────────────
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
bash verify_merge_presence.sh
stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
bash verify_merge_count.sh
+132
View File
@@ -0,0 +1,132 @@
# Benchmark pipeline
Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`.
```
gmake all # full pipeline
gmake simulate # simulation only
gmake reference # reference kmer sets only
```
## Pipeline overview
```mermaid
flowchart TD
GENOMES["genomes/*.fna.gz"]
BIN["obikmer binary"]
GENOMES --> simulate
simulate --> simdata[("simulated_data/")]
simdata --> reference
reference --> refnpz[("reference_index/*.npz")]
subgraph presence ["Presence track"]
simdata --> index_presence
BIN --> index_presence
index_presence --> pres_done[("specimen_index_presence/")]
index_presence --> pres_istats[("stats/indexing_presence/")]
pres_istats --> aggregate_index_presence
pres_done --> merge_presence
BIN --> merge_presence
merge_presence --> gpres[("global_index_presence/")]
refnpz --> verify_presence
pres_done --> verify_presence
verify_presence --> vpres_stats[("stats/verify_presence/")]
vpres_stats --> aggregate_verify_presence
gpres --> filter_presence
BIN --> filter_presence
filter_presence --> spec_pres[("specific_index_presence/")]
filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
spec_pres_stats --> aggregate_filter_presence
refnpz --> verify_merge_presence
gpres --> verify_merge_presence
verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
end
subgraph count ["Count track"]
simdata --> index_count
BIN --> index_count
index_count --> count_done[("specimen_index_count/")]
index_count --> count_istats[("stats/indexing_count/")]
count_istats --> aggregate_index_count
count_done --> merge_count
BIN --> merge_count
merge_count --> gcount[("global_index_count/")]
refnpz --> verify_count
count_done --> verify_count
verify_count --> vcount_stats[("stats/verify_count/")]
vcount_stats --> aggregate_verify_count
gcount --> filter_count
BIN --> filter_count
filter_count --> spec_count[("specific_index_count/")]
filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
spec_count_stats --> aggregate_filter_count
refnpz --> verify_merge_count
gcount --> verify_merge_count
verify_merge_count --> vmc[("stats/verify_merge_count/")]
end
aggregate_verify_presence --> all
aggregate_verify_count --> all
vmp --> all
vmc --> all
all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
all -. "$(MAKE) re-eval" .-> aggregate_filter_count
```
## Steps
| Target | Script | Description |
|---|---|---|
| `simulate` | `simulate.sh` | Simulate sequencing reads from the reference genomes |
| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
## Directory layout
```
benchmark/
├── genomes/ # input reference genomes (.fna.gz)
├── simulated_data/ # generated by simulate
│ └── <species>/<specimen>/
├── reference_index/ # reference kmer sets (.npz)
├── specimen_index_presence/ # per-specimen presence indexes
├── specimen_index_count/ # per-specimen count indexes
├── global_index_presence/ # merged global presence index
├── global_index_count/ # merged global count index
├── specific_index_presence/ # species-specific presence indexes
├── specific_index_count/ # species-specific count indexes
└── stats/ # all benchmark statistics
├── indexing_presence/
├── indexing_count/
├── verify_presence/
├── verify_count/
├── specific_kmer_presence/
├── specific_kmer_count/
├── verify_merge_presence/
└── verify_merge_count/
```
+53
View File
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
# TYPE = indexing_presence | indexing_count | verify_presence | verify_count
#
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header).
# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
# the most recent run CSV (idempotent when nothing changed).
set -euo pipefail
TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
case "${TYPE}" in
indexing_presence|indexing_count)
HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
;;
verify_presence)
HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
;;
verify_count)
HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
;;
specific_kmer_presence|specific_kmer_count)
HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
;;
*)
echo "ERROR: unknown stats type '${TYPE}'" >&2
exit 1
;;
esac
# Find most recent existing run CSV (empty string if none).
latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1)
# Check if any .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]] && \
[[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
echo "[${TYPE}] stats up to date (${latest_csv})"
exit 0
fi
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"
echo "${HEADER}" >"${CSV}"
# Sort .stats files by name for reproducible row order.
while IFS= read -r stats_file; do
sed "s/^/${run_n},/" "${stats_file}"
done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"
echo "[${TYPE}] run ${run_n}${CSV}"
+137
View File
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""Build a reference kmer index from paired-end FASTQ reads.
Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
counts their abundances, and saves a sorted numpy pair (kmers, counts).
Output .npz arrays
kmers : uint64, sorted ascending — canonical kmer integers
counts : uint32, same order — raw read abundances
"""
import argparse
import gzip
import sys
from collections import defaultdict
import numpy as np
# ── encoding ────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
# Lookup table: revcomp of one byte (4 bases, 8 bits).
# Precomputed once at import time.
_REVCOMP8 = [0] * 256
for _i in range(256):
_rc, _x = 0, _i
for _ in range(4):
_rc = (_rc << 2) | (3 - (_x & 3))
_x >>= 2
_REVCOMP8[_i] = _rc
del _i, _rc, _x
def revcomp_int(kmer: int, k: int) -> int:
"""Reverse-complement of a kmer encoded as an integer (2 bits/base).
Uses byte-level lookup (4 bases at a time) for speed.
"""
rc = 0
bits_left = 2 * k
while bits_left > 0:
chunk = min(8, bits_left)
rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk)
rc = (rc << chunk) | rc_byte
kmer >>= chunk
bits_left -= chunk
return rc
# ── FASTQ parsing ────────────────────────────────────────────────────────────
def iter_sequences(path: str):
"""Yield raw sequences from a (gzipped) FASTQ file."""
opener = gzip.open if path.endswith('.gz') else open
with opener(path, 'rt') as fh:
while True:
if not fh.readline(): # '@' header
break
seq = fh.readline().rstrip('\n')
fh.readline() # '+'
fh.readline() # quality
yield seq
# ── kmer counting ────────────────────────────────────────────────────────────
def count_kmers(paths: list[str], k: int) -> dict[int, int]:
mask = (1 << (2 * k)) - 1
counts: dict[int, int] = defaultdict(int)
n_reads = 0
for path in paths:
for seq in iter_sequences(path):
n_reads += 1
kmer = 0
run = 0 # consecutive valid bases
for c in seq:
b = _ENCODE.get(c)
if b is None: # N or unexpected character → reset
kmer = 0
run = 0
continue
kmer = ((kmer << 2) | b) & mask
run += 1
if run >= k:
rc = revcomp_int(kmer, k)
counts[kmer if kmer <= rc else rc] += 1
if n_reads % 100_000 == 0:
print(f' {n_reads:,} reads processed, '
f'{len(counts):,} distinct kmers so far',
file=sys.stderr)
print(f' {n_reads:,} reads total, {len(counts):,} distinct kmers',
file=sys.stderr)
return counts
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('reads', nargs='+', metavar='FASTQ',
help='Input reads (FASTQ, gzip OK)')
ap.add_argument('-k', '--kmer-size', type=int, default=31,
metavar='K')
ap.add_argument('--min-abundance', type=int, default=1,
metavar='N', help='Drop kmers with count < N (default 1)')
ap.add_argument('-o', '--output', required=True,
metavar='FILE', help='Output .npz path')
args = ap.parse_args()
print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr)
counts = count_kmers(args.reads, args.kmer_size)
if args.min_abundance > 1:
before = len(counts)
counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
print(f' min-abundance={args.min_abundance}: '
f'{before - len(counts):,} kmers dropped, '
f'{len(counts):,} retained',
file=sys.stderr)
print(f'Sorting and saving → {args.output}', file=sys.stderr)
kmers_arr = np.fromiter(sorted(counts), dtype=np.uint64, count=len(counts))
counts_arr = np.array([counts[int(k)] for k in kmers_arr], dtype=np.uint32)
np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr)
if __name__ == '__main__':
main()
+39
View File
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
REF_DIR="${SCRIPT_DIR}/reference_index"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
BUILD_PY="${SCRIPT_DIR}/build_reference.py"
KMER_SIZE="${KMER_SIZE:-31}"
MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
mkdir -p "${REF_DIR}"
for species_dir in "${SIMDATA_DIR}"/*/; do
[[ -d "${species_dir}" ]] || continue
species=$(basename "${species_dir}")
for strain_dir in "${species_dir}"*/; do
[[ -d "${strain_dir}" ]] || continue
strain=$(basename "${strain_dir}")
r1="${strain_dir}/reads_R1.fastq.gz"
r2="${strain_dir}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
echo "SKIP ${species}--${strain}: reads not found" >&2
continue
fi
out="${REF_DIR}/${species}--${strain}.npz"
echo "[${species}--${strain}] → ${out}"
"${PYTHON}" "${BUILD_PY}" \
--kmer-size "${KMER_SIZE}" \
--min-abundance "${MIN_ABUNDANCE}" \
--output "${out}" \
"${r1}" "${r2}"
done
done
+199
View File
@@ -0,0 +1,199 @@
SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
# Escherichia_coli--K-12_MG1655
simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
# Escherichia_coli--EDL933
simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
# Salmonella_enterica--LT2
simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
# Escherichia_coli--CFT073
simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
# Bacillus_subtilis--168
simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
# Salmonella_enterica--P125109
simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
# Shouchella_clausii--KSM-K16
simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
# Escherichia_coli--K-12_W3110
simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
# Klebsiella_pneumoniae--MGH_78578
simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
# Opitutus_terrae--PB90-1
simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
# Saccharolobus_islandicus--M.16.4
simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
# Acidobacterium_capsulatum--ATCC_51196
simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
# Salmonella_enterica--AKU_12601
simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
# Proteus_mirabilis--HI4320
simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
# Salmonella_enterica--CT18
simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
# Klebsiella_pneumoniae--HS11286
simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
# Klebsiella_pneumoniae--ATCC_13883
simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
# Yersinia_ruckeri--YRB
simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
# Candidozyma_auris--GCF_003013715.1_ASM301371v2
simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
# Escherichia_coli
specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
# Salmonella_enterica
specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
# Bacillus_subtilis
specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
# Shouchella_clausii
specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
# Klebsiella_pneumoniae
specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
# Opitutus_terrae
specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
# Saccharolobus_islandicus
specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
# Acidobacterium_capsulatum
specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
# Proteus_mirabilis
specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
# Wolbachia_endosymbiont
specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
# Yersinia_ruckeri
specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
# Candidozyma_auris
specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
+48
View File
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -euo pipefail
assemblies=(
GCF_000005845.2
GCF_000010245.2
GCF_000007445.1
GCF_000006665.1
GCF_000006945.2
GCF_000195995.1
GCF_000009505.1
GCF_000026565.1
GCF_000016305.1
GCF_000019965.1
GCF_000240185.1
GCF_000742135.1
GCF_000069965.1
GCF_000022565.1
GCF_000306885.1
GCF_003013715.1
GCF_000009045.1
GCF_000009825.1
GCF_000022445.1
GCF_000834255.1
)
mkdir -p genomes
for acc in "${assemblies[@]}"; do
echo "Downloading ${acc}"
datasets download genome accession "${acc}" \
--include genome \
--filename "${acc}.zip"
unzip -q "${acc}.zip" -d "${acc}"
find "${acc}" -name "*.fna" |
while read file; do
obiconvert -Z ${file} >genomes/$(basename ${file}).gz
done
rm -rf "${acc}" "${acc}.zip"
done
+108
View File
@@ -0,0 +1,108 @@
#!/usr/bin/env bash
# Usage: filter_one_count.sh SPECIES
# Filters global_index_count to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
# specific_index_count/SPECIES/index.done (written by obikmer select)
# stats/specific_kmer_count/SPECIES.stats (one CSV data row, no header)
set -euo pipefail
SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
SOURCE="${SCRIPT_DIR}/global_index_count"
OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIES}] filter (count) → ${OUTPUT}"
LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
"${BINARY}" filter \
--output "${OUTPUT}" \
--force \
--ingroup "species=${SPECIES}" \
--outgroup all \
--min-frac 0.5 \
--max-frac 1.0 \
--max-outgroup-count 0 \
"${SOURCE}" \
2>"${LOG_FILTER}"
cat "${LOG_FILTER}" >&2
"${BINARY}" select \
--in-place \
--group "${SPECIES}:species=${SPECIES}" \
--group-op "${SPECIES}:any" \
--select "${SPECIES}" \
"${OUTPUT}" \
2>"${LOG_SELECT}"
cat "${LOG_SELECT}" >&2
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
def parse_reporter(logfile):
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats['TOTAL'] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
return stats
f = parse_reporter(log_filter)
s = parse_reporter(log_select)
row = [species]
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
key = 'TOTAL' if stage.endswith('_total') else stage
w, r = d.get(key, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
print(','.join(row))
PYEOF
+108
View File
@@ -0,0 +1,108 @@
#!/usr/bin/env bash
# Usage: filter_one_presence.sh SPECIES
# Filters global_index_presence to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
# specific_index_presence/SPECIES/index.done (written by obikmer select)
# stats/specific_kmer_presence/SPECIES.stats (one CSV data row, no header)
set -euo pipefail
SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
SOURCE="${SCRIPT_DIR}/global_index_presence"
OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIES}] filter (presence) → ${OUTPUT}"
LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
"${BINARY}" filter \
--output "${OUTPUT}" \
--force \
--ingroup "species=${SPECIES}" \
--outgroup all \
--min-frac 0.5 \
--max-frac 1.0 \
--max-outgroup-count 0 \
"${SOURCE}" \
2>"${LOG_FILTER}"
cat "${LOG_FILTER}" >&2
"${BINARY}" select \
--in-place \
--group "${SPECIES}:species=${SPECIES}" \
--group-op "${SPECIES}:any" \
--select "${SPECIES}" \
"${OUTPUT}" \
2>"${LOG_SELECT}"
cat "${LOG_SELECT}" >&2
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
def parse_reporter(logfile):
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats['TOTAL'] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
return stats
f = parse_reporter(log_filter)
s = parse_reporter(log_select)
row = [species]
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
key = 'TOTAL' if stage.endswith('_total') else stage
w, r = d.get(key, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
print(','.join(row))
PYEOF
+103
View File
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
# Usage: index_one_count.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
# specimen_index_count/SPECIMEN/index.done (written by obikmer)
# stats/indexing_count/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
echo "ERROR: reads not found in ${READS_DIR}" >&2
exit 1
fi
echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" index \
--output "${INDEX_PATH}" \
--force \
--theta 0 \
--with-counts \
--label "${SPECIMEN}" \
--meta "species=${species}" \
"${r1}" "${r2}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
+102
View File
@@ -0,0 +1,102 @@
#!/usr/bin/env bash
# Usage: index_one_presence.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
# specimen_index_presence/SPECIMEN/index.done (written by obikmer)
# stats/indexing_presence/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
echo "ERROR: reads not found in ${READS_DIR}" >&2
exit 1
fi
echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" index \
--output "${INDEX_PATH}" \
--force \
--theta 0 \
--label "${SPECIMEN}" \
--meta "species=${species}" \
"${r1}" "${r2}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
+118
View File
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
Like C .d files: only target: prerequisites lines, no recipes.
Recipes stay in the Makefile as generic rules.
"""
import gzip
import re
import sys
from pathlib import Path
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
'endosymbiont', 'of'}
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
def is_stop(tok):
t = tok.lower()
return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)
def sanitize(s):
return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')
def collect_tokens(text):
parts = []
for tok in text.split():
tok = tok.rstrip(',.')
if is_stop(tok):
break
parts.append(sanitize(tok))
return '_'.join(filter(None, parts))
def parse_organism(defn, gcf_id):
words = defn.split()
species = sanitize(words[0] + '_' + words[1])
m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
if m:
strain = sanitize(m.group(1))
if m.group(2):
strain += '_' + sanitize(m.group(2))
return species, strain
m = re.search(r'\bstrain\b\s+(.*)', defn)
if m:
strain = collect_tokens(m.group(1))
if strain:
return species, strain
remainder = re.sub(r'^\S+ \S+\s*', '', defn)
remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
strain = collect_tokens(remainder)
return species, strain if strain else gcf_id
def first_definition(path):
with gzip.open(path, 'rt') as fh:
for line in fh:
if line.startswith('>'):
m = re.search(r'"definition":"([^"]*)"', line)
return m.group(1) if m else line[1:].split()[0]
return Path(path).stem
def main():
entries = [] # (specimen, species, sim_dir, genome_path)
species_seen = []
for path in sorted(sys.argv[1:]):
gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
defn = first_definition(path)
sp, st = parse_organism(defn, gcf_id)
specimen = f'{sp}--{st}'
sim_dir = f'simulated_data/{sp}/{st}'
entries.append((specimen, sp, sim_dir, path))
if sp not in species_seen:
species_seen.append(sp)
specimens = [e[0] for e in entries]
print('SPECIMENS :=', ' '.join(specimens))
print('SPECIES :=', ' '.join(species_seen))
for specimen, species, sim_dir, genome in entries:
reads = f'{sim_dir}/reads_R1.fastq.gz'
p_done = f'specimen_index_presence/{specimen}/index.done'
p_stats = f'stats/indexing_presence/{specimen}.stats'
c_done = f'specimen_index_count/{specimen}/index.done'
c_stats = f'stats/indexing_count/{specimen}.stats'
ref = f'reference_index/{specimen}.npz'
vp = f'stats/verify_presence/{specimen}.stats'
vc = f'stats/verify_count/{specimen}.stats'
print()
print(f'# {specimen}')
print(f'{reads}: {genome}')
print(f'{ref}: {reads}')
print(f'{p_done} {p_stats}: {reads}')
print(f'{c_done} {c_stats}: {reads}')
print(f'{vp}: {ref} {p_done}')
print(f'{vc}: {ref} {c_done}')
print()
for sp in species_seen:
sp_done = f'specific_index_presence/{sp}/index.done'
sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
sc_done = f'specific_index_count/{sp}/index.done'
sc_stats = f'stats/specific_kmer_count/{sp}.stats'
print(f'# {sp}')
print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
print(f'{sc_done} {sc_stats}: global_index_count/index.done')
if __name__ == '__main__':
main()
+103
View File
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
OUTPUT="${SCRIPT_DIR}/global_index_count"
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"
mkdir -p "${STATS_DIR}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"
printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
parse_reporter() {
local run="$1" n_sources="$2" logfile="$3"
python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re
run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s):
state = 'rows'
elif state == 'rows':
if is_sep(s):
state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
if [[ ${#sources[@]} -eq 0 ]]; then
echo "ERROR: no indexes found in ${IDX_DIR}" >&2
exit 1
fi
echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" merge \
--output "${OUTPUT}" \
--force \
"${sources[@]}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
echo "Done. Run ${run_n}${CSV}"
+104
View File
@@ -0,0 +1,104 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
OUTPUT="${SCRIPT_DIR}/global_index_presence"
STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"
mkdir -p "${STATS_DIR}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"
printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
parse_reporter() {
local run="$1" n_sources="$2" logfile="$3"
python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re
run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s):
state = 'rows'
elif state == 'rows':
if is_sep(s):
state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
if [[ ${#sources[@]} -eq 0 ]]; then
echo "ERROR: no indexes found in ${IDX_DIR}" >&2
exit 1
fi
echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" merge \
--output "${OUTPUT}" \
--force \
--force-presence \
"${sources[@]}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
echo "Done. Run ${run_n}${CSV}"
+12
View File
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Simulate all genomes. Delegates to simulate_one.sh per genome.
# Prefer running via `gmake simulate` which handles individual dependencies.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for genome_file in "${SCRIPT_DIR}"/genomes/*.fna.gz; do
out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
--dir-for "${genome_file}")
bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
done
+33
View File
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
genome_file="$1"
out_dir="$2"
mkdir -p "${out_dir}"
tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna")
trap 'rm -f "${tmp_fasta}"' EXIT
gzip -dc "${genome_file}" > "${tmp_fasta}"
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"
"${ISS}" generate \
--genomes "${tmp_fasta}" \
--model HiSeq \
--n_reads "${n_reads}" \
--cpus "${CPUS}" \
--compress \
--output "${out_dir}/reads"
+181
View File
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""Compare an obikmer count index against a reference kmer set (presence + counts).
Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
streams `obikmer dump` from a --with-counts index, then reports:
- false negatives : kmers in reference absent from the index
- false positives : kmers in the index absent from the reference
- count mismatches: kmers present in both but with differing counts
Output to stdout: one CSV row
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
fn_pct,fp_pct,cm_pct
"""
import argparse
import subprocess
import sys
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── dump parsing ──────────────────────────────────────────────────────────────
def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
"""Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32)."""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
kmers, counts = [], []
header = True
for line in proc.stdout:
if header:
header = False
continue
parts = line.rstrip('\n').split(',')
kmers.append(encode_kmer(parts[0]))
counts.append(int(parts[1]))
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
order = np.argsort(np.array(kmers, dtype=np.uint64), kind='stable')
return (np.array(kmers, dtype=np.uint64)[order],
np.array(counts, dtype=np.uint32)[order])
# ── comparison ────────────────────────────────────────────────────────────────
def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
idx_kmers: np.ndarray, idx_counts: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).
All arrays sorted; cm_* cover kmers present in both arrays but with
differing counts.
"""
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
# Count mismatches among shared kmers.
# Both arrays are sorted so we can use searchsorted.
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
shared_ref_counts = ref_counts[shared_mask]
shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
mismatch_mask = shared_ref_counts != shared_idx_counts
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
cm_ref_counts = shared_ref_counts[mismatch_mask]
cm_idx_counts = shared_idx_counts[mismatch_mask]
return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('reference', metavar='REF_NPZ', nargs='?',
help='Reference .npz file')
ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
help='obikmer index directory (built with --with-counts)')
ap.add_argument('--obikmer', default='obikmer',
help='Path to obikmer binary')
ap.add_argument('--species', default='')
ap.add_argument('--strain', default='')
ap.add_argument('--header', action='store_true',
help='Print CSV header and exit')
ap.add_argument('--save-fp', metavar='FILE',
help='Save false-positive kmer strings to FILE')
ap.add_argument('--save-fn', metavar='FILE',
help='Save false-negative kmer strings to FILE')
ap.add_argument('--save-cm', metavar='FILE',
help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,count_mismatch,'
'fn_pct,fp_pct,cm_pct')
return
# Detect k
cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
# Load reference
print(f'Loading reference: {args.reference}', file=sys.stderr)
npz = np.load(args.reference)
ref_kmers = npz['kmers'] # sorted uint64
ref_counts = npz['counts'] # uint32
# Load index
print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
idx_kmers, idx_counts = load_index(args.obikmer, args.index)
print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)
false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
ref_kmers, ref_counts, idx_kmers, idx_counts)
n_shared = len(ref_kmers) - len(false_neg)
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)',
file=sys.stderr)
if args.save_fn and len(false_neg):
with open(args.save_fn, 'w') as fh:
for v in false_neg:
fh.write(decode_kmer(int(v), k) + '\n')
if args.save_fp and len(false_pos):
with open(args.save_fp, 'w') as fh:
for v in false_pos:
fh.write(decode_kmer(int(v), k) + '\n')
if args.save_cm and len(cm_kmers):
with open(args.save_cm, 'w') as fh:
fh.write('kmer,ref_count,idx_count\n')
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')
print(f'{args.species},{args.strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
if __name__ == '__main__':
main()
+201
View File
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""Verify the merged count index against all per-specimen reference sets.
Streams `obikmer dump` once on the merged index, accumulates per-specimen
kmer+count pairs from each column, then compares each against its reference .npz.
Output to stdout: one CSV row per specimen (same columns as verify_count.py)
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
fn_pct,fp_pct,cm_pct
"""
import argparse
import subprocess
import sys
from pathlib import Path
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── single-pass dump ──────────────────────────────────────────────────────────
def stream_merged_dump(obikmer_bin: str, index_dir: str,
) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
"""Stream the merged dump once.
Returns:
specimen_names : column labels in dump order
per_specimen : mapping label → (kmer_ints, counts) for entries > 0
"""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
header_line = proc.stdout.readline().rstrip('\n')
cols = header_line.split(',')
specimen_names = cols[1:]
per_specimen: dict[str, tuple[list[int], list[int]]] = {
name: ([], []) for name in specimen_names}
for line in proc.stdout:
parts = line.rstrip('\n').split(',')
kmer_int = encode_kmer(parts[0])
for i, name in enumerate(specimen_names):
count = int(parts[i + 1])
if count > 0:
per_specimen[name][0].append(kmer_int)
per_specimen[name][1].append(count)
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
return specimen_names, per_specimen
# ── per-specimen comparison ───────────────────────────────────────────────────
def compare_specimen(name: str,
kmer_list: list[int],
count_list: list[int],
ref_dir: Path,
k: int,
save_fn: Path | None,
save_fp: Path | None,
save_cm: Path | None,
) -> str:
ref_path = ref_dir / f'{name}.npz'
if not ref_path.exists():
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
return ''
species = name.split('--')[0]
strain = name[len(species) + 2:]
npz = np.load(ref_path)
ref_kmers = npz['kmers'] # sorted uint64
ref_counts = npz['counts'] # uint32
order = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
idx_kmers = np.array(kmer_list, dtype=np.uint64)[order]
idx_counts = np.array(count_list, dtype=np.uint32)[order]
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
# Count mismatches among shared kmers
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
cm_ref = ref_counts[shared_mask][mismatch_mask]
cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
n_shared = int(shared_mask.sum())
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
f'fp={len(false_pos):,} ({fp_pct:.4f}%) '
f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
file=sys.stderr)
if save_fn and len(false_neg):
fn_file = save_fn / f'{name}_fn.txt'
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
if save_fp and len(false_pos):
fp_file = save_fp / f'{name}_fp.txt'
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
if save_cm and len(cm_kmers):
cm_file = save_cm / f'{name}_cm.csv'
lines = ['kmer,ref_count,idx_count']
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
cm_file.write_text('\n'.join(lines) + '\n')
return (f'{species},{strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
help='Merged count index directory')
ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
help='Directory containing per-specimen .npz reference files')
ap.add_argument('--obikmer', default='obikmer')
ap.add_argument('--header', action='store_true',
help='Print CSV header and exit')
ap.add_argument('--save-fn', metavar='DIR',
help='Directory for false-negative kmer lists')
ap.add_argument('--save-fp', metavar='DIR',
help='Directory for false-positive kmer lists')
ap.add_argument('--save-cm', metavar='DIR',
help='Directory for count-mismatch CSV files')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,count_mismatch,'
'fn_pct,fp_pct,cm_pct')
return
ref_dir = Path(args.ref_dir)
save_fn = Path(args.save_fn) if args.save_fn else None
save_fp = Path(args.save_fp) if args.save_fp else None
save_cm = Path(args.save_cm) if args.save_cm else None
for d in (save_fn, save_fp, save_cm):
if d: d.mkdir(parents=True, exist_ok=True)
out1 = subprocess.check_output(
[args.obikmer, 'dump', '--head', '1', args.index],
stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
for name in specimen_names:
kmers, counts = per_specimen[name]
row = compare_specimen(name, kmers, counts, ref_dir, k,
save_fn, save_fp, save_cm)
if row:
print(row)
if __name__ == '__main__':
main()
+27
View File
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_count"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"
mkdir -p "${STATS_DIR}"
CURRENT="${STATS_DIR}/current.csv"
"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
"${INDEX}" "${REF_DIR}" \
>>"${CURRENT}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')")
ARCHIVE="${STATS_DIR}/count_${run_n}.csv"
cp "${CURRENT}" "${ARCHIVE}"
echo "Done. Results → ${ARCHIVE}"
+170
View File
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""Verify the merged presence index against all per-specimen reference sets.
Streams `obikmer dump` once on the merged index, accumulates per-specimen
kmer sets from each column, then compares each against its reference .npz.
Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
"""
import argparse
import subprocess
import sys
from pathlib import Path
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── single-pass dump ──────────────────────────────────────────────────────────
def stream_merged_dump(obikmer_bin: str, index_dir: str,
) -> tuple[list[str], dict[str, list[int]]]:
"""Stream the merged dump once.
Returns:
specimen_names : column labels in dump order (excluding 'kmer')
per_specimen : mapping label → list of kmer ints where presence > 0
"""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
header_line = proc.stdout.readline().rstrip('\n')
cols = header_line.split(',')
specimen_names = cols[1:] # first col is 'kmer'
per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
for line in proc.stdout:
parts = line.rstrip('\n').split(',')
kmer_int = encode_kmer(parts[0])
for i, name in enumerate(specimen_names):
if int(parts[i + 1]) > 0:
per_specimen[name].append(kmer_int)
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
return specimen_names, per_specimen
# ── per-specimen comparison ───────────────────────────────────────────────────
def compare_specimen(name: str,
kmer_list: list[int],
ref_dir: Path,
k: int,
save_fn: Path | None,
save_fp: Path | None,
) -> str:
"""Compare one specimen column against its reference .npz.
Returns a CSV row string.
"""
ref_path = ref_dir / f'{name}.npz'
if not ref_path.exists():
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
return ''
species = name.split('--')[0]
strain = name[len(species) + 2:]
ref_kmers = np.load(ref_path)['kmers'] # sorted uint64
idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
file=sys.stderr)
if save_fn and len(false_neg):
fn_file = save_fn / f'{name}_fn.txt'
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
if save_fp and len(false_pos):
fp_file = save_fp / f'{name}_fp.txt'
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
return (f'{species},{strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},'
f'{fn_pct:.4f},{fp_pct:.4f}')
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
help='Merged presence index directory')
ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
help='Directory containing per-specimen .npz reference files')
ap.add_argument('--obikmer', default='obikmer')
ap.add_argument('--header', action='store_true',
help='Print CSV header and exit')
ap.add_argument('--save-fn', metavar='DIR',
help='Directory to save false-negative kmer lists')
ap.add_argument('--save-fp', metavar='DIR',
help='Directory to save false-positive kmer lists')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,fn_pct,fp_pct')
return
ref_dir = Path(args.ref_dir)
save_fn = Path(args.save_fn) if args.save_fn else None
save_fp = Path(args.save_fp) if args.save_fp else None
if save_fn: save_fn.mkdir(parents=True, exist_ok=True)
if save_fp: save_fp.mkdir(parents=True, exist_ok=True)
# Detect k
out1 = subprocess.check_output(
[args.obikmer, 'dump', '--head', '1', args.index],
stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
for name in specimen_names:
row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
if row:
print(row)
if __name__ == '__main__':
main()
+27
View File
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_presence"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"
mkdir -p "${STATS_DIR}"
CURRENT="${STATS_DIR}/current.csv"
"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
"${INDEX}" "${REF_DIR}" \
>>"${CURRENT}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')")
ARCHIVE="${STATS_DIR}/presence_${run_n}.csv"
cp "${CURRENT}" "${ARCHIVE}"
echo "Done. Results → ${ARCHIVE}"
+30
View File
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Usage: verify_one_count.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_count.py"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIMEN}] verifying count"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
--species "${species}" \
--strain "${strain}" \
"${REF_NPZ}" "${INDEX_DIR}" \
>"${STATS_FILE}"
+30
View File
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Usage: verify_one_presence.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIMEN}] verifying presence"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
--species "${species}" \
--strain "${strain}" \
"${REF_NPZ}" "${INDEX_DIR}" \
>"${STATS_FILE}"
+139
View File
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""Compare an obikmer index against a reference kmer set (presence/absence).
Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
streams the output of `obikmer dump`, encodes each kmer string to uint64,
then reports false negatives and false positives using numpy set operations.
Output to stdout: one CSV row
species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
"""
import argparse
import subprocess
import sys
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── dump parsing ──────────────────────────────────────────────────────────────
def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
"""Stream `obikmer dump` and return a sorted uint64 array of kmer integers."""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
kmers = []
header = True
for line in proc.stdout:
if header:
header = False
continue
kmer_str = line.split(',', 1)[0]
kmers.append(encode_kmer(kmer_str))
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
arr = np.array(kmers, dtype=np.uint64)
arr.sort()
return arr
# ── comparison ────────────────────────────────────────────────────────────────
def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
"""Return (false_negatives, false_positives) as uint64 arrays."""
false_neg = np.setdiff1d(ref, idx, assume_unique=True)
false_pos = np.setdiff1d(idx, ref, assume_unique=True)
return false_neg, false_pos
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file')
ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary')
ap.add_argument('--species', default='', help='Species label for CSV row')
ap.add_argument('--strain', default='', help='Strain label for CSV row')
ap.add_argument('--header', action='store_true', help='Print CSV header and exit')
ap.add_argument('--save-fp', metavar='FILE',
help='Save false-positive kmer strings to FILE')
ap.add_argument('--save-fn', metavar='FILE',
help='Save false-negative kmer strings to FILE')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,fn_pct,fp_pct')
return
# Detect k from the index (one cheap call before the full dump).
cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
# Load reference
print(f'Loading reference: {args.reference}', file=sys.stderr)
npz = np.load(args.reference)
ref_kmers = npz['kmers'] # already sorted uint64
# Load index
print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
idx_kmers = load_index_kmers(args.obikmer, args.index)
print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)
false_neg, false_pos = compare(ref_kmers, idx_kmers)
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
if args.save_fn and len(false_neg):
with open(args.save_fn, 'w') as fh:
for v in false_neg:
fh.write(decode_kmer(int(v), k) + '\n')
print(f'False negatives saved → {args.save_fn}', file=sys.stderr)
if args.save_fp and len(false_pos):
with open(args.save_fp, 'w') as fh:
for v in false_pos:
fh.write(decode_kmer(int(v), k) + '\n')
print(f'False positives saved → {args.save_fp}', file=sys.stderr)
print(f'{args.species},{args.strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},'
f'{fn_pct:.4f},{fp_pct:.4f}')
if __name__ == '__main__':
main()
+8 -7
View File
@@ -29,16 +29,17 @@ Multiple values separated by `|` are always OR-ed within the predicate.
### Path matching (`~` and `!~`)
Metadata values can represent hierarchical taxonomic paths such as
Metadata values can represent hierarchical concept paths such as
`/Eukaryota/Viridiplantae/Streptophyta/Betulaceae/Betula/nana`.
- **Absolute pattern** (starts with `/`): the value must start with the pattern
at a segment boundary.
`taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
**Both the stored metadata value and the pattern must start with `/`.**
A pattern that does not start with `/` is rejected at parse time with an error.
The value matches the pattern if it equals it exactly or starts with the pattern
followed by `/` (segment-boundary prefix):
- `taxon~/Betulaceae/Betula` matches `/Betulaceae/Betula/nana` and
`/Betulaceae/Betula` but not `/Betulaceae/Betuloides/…`.
- **Bare segment** (no leading `/`): the value must contain the pattern as an
exact path component anywhere.
`taxon~Betula` matches any path that has `Betula` as one of its segments.
### Missing metadata key → NA
+1
View File
@@ -53,6 +53,7 @@ nav:
- Merge parallelism & memory: implementation/merge_parallelism.md
- Kmer filtering: implementation/filtering.md
- Select command: implementation/select.md
- obitaxonomy crate: implementation/obitaxonomy.md
- Architecture:
- Sequences: architecture/sequences/invariant.md
- Kmer index: architecture/index_architecture.md
+4
View File
@@ -1853,6 +1853,10 @@ dependencies = [
"tracing",
]
[[package]]
name = "obitaxonomy"
version = "0.1.0"
[[package]]
name = "object"
version = "0.37.3"
+1 -1
View File
@@ -1,5 +1,5 @@
[workspace]
resolver = "3"
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex"]
members = ["obikseq", "obiread", "obiskbuilder", "obifastwrite", "obikmer","obikrope","obipipeline", "obikpartitionner","obiskio","obidebruinj","obilayeredmap", "obicompactvec", "obisys", "obikindex", "obitaxonomy"]
[profile.release]
debug = 1
+4 -4
View File
@@ -88,9 +88,9 @@ impl<'a> IntoIterator for &'a PersistentBitVec {
// ── BitIter ───────────────────────────────────────────────────────────────────
pub struct BitIter<'a> {
pub(crate) words: &'a [u64],
pub(crate) slot: usize,
pub(crate) n: usize,
words: &'a [u64],
slot: usize,
n: usize,
}
impl ExactSizeIterator for BitIter<'_> {}
@@ -132,7 +132,7 @@ impl PersistentBitVecBuilder {
Ok(Self { mmap, n, path: path.to_path_buf() })
}
pub(crate) fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
pub fn from_raw_bytes(bytes: &[u8], n: usize, path: &Path) -> io::Result<Self> {
let file_size = HEADER_SIZE + n_bytes_for_words(n);
let file = OpenOptions::new()
.read(true).write(true).create(true).truncate(true)
+4 -4
View File
@@ -18,11 +18,11 @@ pub use builder::PersistentCompactIntVecBuilder;
pub use colgroup::{ColGroup, FilterMask, MatrixGroupOps, eval_filter_mask};
pub use intmatrix::{PersistentCompactIntMatrix, PersistentCompactIntMatrixBuilder, pack_compact_int_matrix};
pub use layer_meta::LayerMeta;
pub use reader::PersistentCompactIntVec;
pub use tempbitvec::TempBitVec;
pub use tempintvec::TempCompactIntVec;
pub use reader::{PersistentCompactIntVec, Iter as CompactIntVecIter};
pub use tempbitvec::{TempBitVec, TempBitVecBuilder};
pub use tempintvec::{TempCompactIntVec, TempCompactIntVecBuilder};
pub use traits::{BitPartials, ColumnWeights, CountPartials};
pub use views::{BitSliceView, IntSliceView};
pub use views::{BitSliceView, BitSliceIter, IntSliceView, IntSliceViewIter};
#[cfg(test)]
#[path = "tests/mod.rs"]
+12 -11
View File
@@ -43,27 +43,27 @@ impl TempBitVec {
// ── TempBitVecBuilder — mutable, becomes TempBitVec on freeze ────────────────
pub(crate) struct TempBitVecBuilder {
pub struct TempBitVecBuilder {
builder: PersistentBitVecBuilder,
temp: TempDir,
}
impl TempBitVecBuilder {
pub(crate) fn new(n: usize) -> io::Result<Self> {
pub fn new(n: usize) -> io::Result<Self> {
let temp = TempDir::new()?;
let path = temp.path().join("data.pbiv");
let builder = PersistentBitVecBuilder::new(n, &path)?;
Ok(Self { builder, temp })
}
pub(crate) fn new_ones(n: usize) -> io::Result<Self> {
pub fn new_ones(n: usize) -> io::Result<Self> {
let temp = TempDir::new()?;
let path = temp.path().join("data.pbiv");
let builder = PersistentBitVecBuilder::new_ones(n, &path)?;
Ok(Self { builder, temp })
}
pub(crate) fn freeze(self) -> io::Result<TempBitVec> {
pub fn freeze(self) -> io::Result<TempBitVec> {
let Self { builder, temp } = self;
let vec = builder.finish()?;
Ok(TempBitVec { vec, _temp: temp })
@@ -72,7 +72,8 @@ impl TempBitVecBuilder {
pub fn set(&mut self, slot: usize, value: bool) {
self.builder.set(slot, value);
}
pub(crate) fn view(&self) -> BitSliceView<'_> {
pub fn view(&self) -> BitSliceView<'_> {
self.builder.view()
}
@@ -80,19 +81,19 @@ impl TempBitVecBuilder {
self.builder.or(other);
}
pub(crate) fn and(&mut self, other: BitSliceView<'_>) {
pub fn and(&mut self, other: BitSliceView<'_>) {
self.builder.and(other);
}
pub(crate) fn xor(&mut self, other: BitSliceView<'_>) {
pub fn xor(&mut self, other: BitSliceView<'_>) {
self.builder.xor(other);
}
pub(crate) fn not(&mut self) {
pub fn not(&mut self) {
self.builder.not();
}
pub(crate) fn copy_from(&mut self, src: BitSliceView<'_>) {
pub fn copy_from(&mut self, src: BitSliceView<'_>) {
self.builder.copy_from(src);
}
@@ -100,11 +101,11 @@ impl TempBitVecBuilder {
self.builder.or_where(col, pred);
}
pub(crate) fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
pub fn and_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
self.builder.and_where(col, pred);
}
pub(crate) fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
pub fn xor_where(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
self.builder.xor_where(col, pred);
}
}
+17 -19
View File
@@ -32,60 +32,58 @@ impl TempCompactIntVec {
// ── TempCompactIntVecBuilder — mutable, becomes TempCompactIntVec on freeze ──
pub(crate) struct TempCompactIntVecBuilder {
pub struct TempCompactIntVecBuilder {
builder: PersistentCompactIntVecBuilder,
temp: TempDir,
}
impl TempCompactIntVecBuilder {
pub(crate) fn new(n: usize) -> io::Result<Self> {
pub fn new(n: usize) -> io::Result<Self> {
let temp = TempDir::new()?;
let path = temp.path().join("data.pciv");
let builder = PersistentCompactIntVecBuilder::new(n, &path)?;
Ok(Self { builder, temp })
}
pub(crate) fn freeze(self) -> io::Result<TempCompactIntVec> {
pub fn freeze(self) -> io::Result<TempCompactIntVec> {
let Self { builder, temp } = self;
let vec = builder.finish()?;
Ok(TempCompactIntVec { vec, _temp: temp })
}
// ── Delegation methods ────────────────────────────────────────────────────
pub fn n(&self) -> usize { self.builder.len() }
pub(crate) fn n(&self) -> usize { self.builder.len() }
pub fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
pub fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
pub(crate) fn set(&mut self, slot: usize, value: u32) { self.builder.set(slot, value); }
pub(crate) fn get(&self, slot: usize) -> u32 { self.builder.get(slot) }
pub fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
pub fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
pub(crate) fn primary_bytes(&self) -> &[u8] { self.builder.primary_bytes() }
pub(crate) fn primary_bytes_mut(&mut self) -> &mut [u8] { self.builder.primary_bytes_mut() }
pub(crate) fn inc_present(&mut self, col: BitSliceView<'_>) {
pub fn inc_present(&mut self, col: BitSliceView<'_>) {
self.builder.inc_present(col);
}
pub(crate) fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
pub fn inc_present_fast(&mut self, col: BitSliceView<'_>) {
self.builder.inc_present_fast(col);
}
pub(crate) fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
pub fn inc_predicate(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
self.builder.inc_predicate(col, pred);
}
pub(crate) fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
pub fn inc_predicate_fast(&mut self, col: IntSliceView<'_>, pred: impl Fn(u32) -> bool) {
self.builder.inc_predicate_fast(col, pred);
}
pub(crate) fn add(&mut self, other: IntSliceView<'_>) {
pub fn add(&mut self, other: IntSliceView<'_>) {
self.builder.add(other);
}
pub(crate) fn mask_with(&mut self, mask: BitSliceView<'_>) {
pub fn mask_with(&mut self, mask: BitSliceView<'_>) {
self.builder.mask_with(mask);
}
pub(crate) fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
pub(crate) fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
pub(crate) fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
pub fn min(&mut self, other: IntSliceView<'_>) { self.builder.min(other); }
pub fn max(&mut self, other: IntSliceView<'_>) { self.builder.max(other); }
pub fn diff(&mut self, other: IntSliceView<'_>) { self.builder.diff(other); }
}
+1
View File
@@ -3,6 +3,7 @@ use crossbeam_channel;
use hashbrown::HashMap;
use obikseq::k;
use obikseq::{CanonicalKmer, Sequence, Unitig};
#[cfg(not(any(test, feature = "test-utils")))]
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use std::cell::RefCell;
use std::fmt;
+3 -1
View File
@@ -11,7 +11,7 @@ use obilayeredmap::IndexMode;
use crate::error::{OKIError, OKIResult};
use crate::index::KmerIndex;
use crate::meta::{GenomeInfo, IndexMeta};
use crate::state::IndexState;
use crate::state::{IndexState, SENTINEL_INDEXED};
pub use obikpartitionner::MergeMode;
@@ -263,6 +263,8 @@ impl KmerIndex {
rep.push(t.stop());
}
fs::File::create(output.join(SENTINEL_INDEXED)).map_err(OKIError::Io)?;
KmerIndex::open(output)
}
}
+10 -9
View File
@@ -49,6 +49,11 @@ impl MetaPred {
if values.iter().any(|v| v.is_empty()) {
return Err(format!("empty value in predicate: {s}"));
}
if matches!(op, PredOp::Matches | PredOp::NotMatches) {
if let Some(v) = values.iter().find(|v| !v.starts_with('/')) {
return Err(format!("path predicate value must start with '/': {v:?} in predicate: {s}"));
}
}
Ok(Self { key, op, values })
}
@@ -72,16 +77,12 @@ impl MetaPred {
/// True if `value` is equal to `pattern` or is a descendant of it in a `/`-separated hierarchy.
///
/// - Absolute pattern (`/a/b`): `value` must start with `/a/b` at a segment boundary.
/// - Bare segment (`b`): `value` must contain `b` as an exact segment anywhere.
/// Both `value` and `pattern` must start with `/`.
/// `value` matches if it equals `pattern` exactly or starts with `pattern` followed by `/`.
fn path_matches(value: &str, pattern: &str) -> bool {
if pattern.starts_with('/') {
value == pattern
|| (value.starts_with(pattern)
&& value[pattern.len()..].starts_with('/'))
} else {
value.split('/').any(|seg| seg == pattern)
}
value == pattern
|| (value.starts_with(pattern)
&& value[pattern.len()..].starts_with('/'))
}
// ── Three-value group evaluation ──────────────────────────────────────────────
-28
View File
@@ -1,28 +0,0 @@
>F1FE4776BF3E1F06 {"seq_length":51,"kmer_size":31,"minimizer_size":11,"partition":229,"minimizer":"AAAAAAAATTA"}
GAGTATACTCATGTGAGGGTAAAAAAAATTAAGTCCCATATTGAAACATTA
>C14BF81526DD6CB7 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":84,"minimizer":"AAAAAAATTAA"}
AAAAAAATTAAGTCCCATATTGAAACATTAT
>9156D79605E4AC23 {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":87,"minimizer":"AAAAAATTAAG"}
AAAAAATTAAGTCCCATATTGAAACATTATC
>74666D1D78812D1E {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":118,"minimizer":"AAAAATTAAGT"}
AAAAATTAAGTCCCATATTGAAACATTATCA
>45EEFC3520FBDA9A {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":32,"minimizer":"AAAATTAAGTC"}
AAAATTAAGTCCCATATTGAAACATTATCAC
>5F44864B90170AF4 {"seq_length":49,"kmer_size":31,"minimizer_size":11,"partition":137,"minimizer":"AAACATTATCA"}
AAATTAAGTCCCATATTGAAACATTATCACAAATGTGAGTTGTTAATAT
>8D10A11C86F8EF26 {"seq_length":42,"kmer_size":31,"minimizer_size":11,"partition":26,"minimizer":"AAATGTGAGTT"}
AACATTATCACAAATGTGAGTTGTTAATATTACATAATTGGG
>C18F1086D0AF6E34 {"seq_length":32,"kmer_size":31,"minimizer_size":11,"partition":9,"minimizer":"TGTGAGTTGTT"}
AATGTGAGTTGTTAATATTACATAATTGGGTT
>933477394DAF03BB {"seq_length":31,"kmer_size":31,"minimizer_size":11,"partition":48,"minimizer":"TAATTGGGTTT"}
TGTGAGTTGTTAATATTACATAATTGGGTTT
>3CEE7E5227956042 {"seq_length":36,"kmer_size":31,"minimizer_size":11,"partition":252,"minimizer":"AATTGGGTTTT"}
GTGAGTTGTTAATATTACATAATTGGGTTTTATGCT
>1BAF5B8767D63D0B {"seq_length":33,"kmer_size":31,"minimizer_size":11,"partition":201,"minimizer":"AAAGGCTCCCT"}
TGAAAGGCTCCCTAGCGTGTTAATTAATCTCCC
>8368A897DB263C6F {"seq_length":38,"kmer_size":31,"minimizer_size":11,"partition":22,"minimizer":"CCTAGCGTGTT"}
AAGGCTCCCTAGCGTGTTAATTAATCTCCCTGACAAGT
>247DC82E11CF8055 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":128,"minimizer":"AATCTCCCTGA"}
CTAGCGTGTTAATTAATCTCCCTGACAAGTAGTGT
>11C93BBC8A5F6327 {"seq_length":35,"kmer_size":31,"minimizer_size":11,"partition":62,"minimizer":"CAAGTAGTGTT"}
GTGTTAATTAATCTCCCTGACAAGTAGTGTTAGTG