feat: add benchmark pipeline, expose APIs, and enforce strict paths

Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
This commit is contained in:
Eric Coissac
2026-06-19 09:55:41 +02:00
parent 280ca1f5a3
commit c694e1f2b0
42 changed files with 2585 additions and 84 deletions
+144
View File
@@ -0,0 +1,144 @@
# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
BINARY := ../src/target/release/obikmer
VENV_PY := ../.venv/bin/python3
GENOMES := $(wildcard genomes/*.fna.gz)
# SPECIMENS, SPECIES, and the full dependency graph are generated by
# make_deps.py from the genome FASTA headers — like .d files in C.
# Make rebuilds deps.mk whenever genomes/ changes and restarts.
-include deps.mk
REF_NPZS := $(SPECIMENS:%=reference_index/%.npz)
PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done)
PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats)
COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done)
COUNT_STATS := $(SPECIMENS:%=stats/indexing_count/%.stats)
VERIFY_PRESENCE_STATS := $(SPECIMENS:%=stats/verify_presence/%.stats)
VERIFY_COUNT_STATS := $(SPECIMENS:%=stats/verify_count/%.stats)
SPECIFIC_PRESENCE_DONE := $(SPECIES:%=specific_index_presence/%/index.done)
SPECIFIC_PRESENCE_STATS := $(SPECIES:%=stats/specific_kmer_presence/%.stats)
SPECIFIC_COUNT_DONE := $(SPECIES:%=specific_index_count/%/index.done)
SPECIFIC_COUNT_STATS := $(SPECIES:%=stats/specific_kmer_count/%.stats)
SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/reads_R1.fastq.gz)
.NOTPARALLEL:
.PHONY: all simulate reference \
index_presence index_count \
aggregate_index_presence aggregate_index_count \
merge_presence merge_count \
verify_presence verify_count \
aggregate_verify_presence aggregate_verify_count \
verify_merge_presence verify_merge_count \
filter_presence filter_count \
aggregate_filter_presence aggregate_filter_count
verify_merge_presence: stats/verify_merge_presence/current.csv
verify_merge_count: stats/verify_merge_count/current.csv
all: aggregate_verify_presence aggregate_verify_count \
verify_merge_presence verify_merge_count \
aggregate_filter_presence aggregate_filter_count
# ── dependency file ───────────────────────────────────────────────────────────
deps.mk: $(GENOMES)
$(VENV_PY) make_deps.py $^ > $@
# ── simulation ────────────────────────────────────────────────────────────────
# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
$(SIMULATED_READS):
bash simulate_one.sh $< $(dir $@)
simulate: $(SIMULATED_READS)
# ── reference kmer sets ───────────────────────────────────────────────────────
# Prerequisites (reads → npz) are in deps.mk.
reference_index/%.npz:
bash build_reference.sh $*
reference: $(REF_NPZS)
# ── per-specimen indexing ─────────────────────────────────────────────────────
# Prerequisites (reads → index.done + .stats) are in deps.mk.
specimen_index_presence/%/index.done \
stats/indexing_presence/%.stats &: $(BINARY)
bash index_one_presence.sh $*
specimen_index_count/%/index.done \
stats/indexing_count/%.stats &: $(BINARY)
bash index_one_count.sh $*
index_presence: $(PRESENCE_DONE)
index_count: $(COUNT_DONE)
# ── indexing stats aggregation ────────────────────────────────────────────────
aggregate_index_presence: $(PRESENCE_STATS)
bash aggregate_stats.sh indexing_presence
aggregate_index_count: $(COUNT_STATS)
bash aggregate_stats.sh indexing_count
# ── global merge ──────────────────────────────────────────────────────────────
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
bash merge_presence.sh
global_index_count/index.done: $(COUNT_DONE) $(BINARY)
bash merge_count.sh
merge_presence: global_index_presence/index.done
merge_count: global_index_count/index.done
# ── per-specimen verification ─────────────────────────────────────────────────
# Prerequisites (index.done + npz → .stats) are in deps.mk.
stats/verify_presence/%.stats:
bash verify_one_presence.sh $*
stats/verify_count/%.stats:
bash verify_one_count.sh $*
verify_presence: $(VERIFY_PRESENCE_STATS)
verify_count: $(VERIFY_COUNT_STATS)
# ── verification stats aggregation ───────────────────────────────────────────
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
bash aggregate_stats.sh verify_presence
aggregate_verify_count: $(VERIFY_COUNT_STATS)
bash aggregate_stats.sh verify_count
# ── species-specific indexes ──────────────────────────────────────────────────
# Prerequisites (global index → specific index) are in deps.mk.
specific_index_presence/%/index.done \
stats/specific_kmer_presence/%.stats &: $(BINARY)
bash filter_one_presence.sh $*
specific_index_count/%/index.done \
stats/specific_kmer_count/%.stats &: $(BINARY)
bash filter_one_count.sh $*
filter_presence: $(SPECIFIC_PRESENCE_DONE)
filter_count: $(SPECIFIC_COUNT_DONE)
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
bash aggregate_stats.sh specific_kmer_presence
aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
bash aggregate_stats.sh specific_kmer_count
# ── merged index verification ─────────────────────────────────────────────────
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
bash verify_merge_presence.sh
stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
bash verify_merge_count.sh
+132
View File
@@ -0,0 +1,132 @@
# Benchmark pipeline
Requires **GNU Make ≥ 4.3** (grouped targets `&:`). On macOS use `gmake`.
```
gmake all # full pipeline
gmake simulate # simulation only
gmake reference # reference kmer sets only
```
## Pipeline overview
```mermaid
flowchart TD
GENOMES["genomes/*.fna.gz"]
BIN["obikmer binary"]
GENOMES --> simulate
simulate --> simdata[("simulated_data/")]
simdata --> reference
reference --> refnpz[("reference_index/*.npz")]
subgraph presence ["Presence track"]
simdata --> index_presence
BIN --> index_presence
index_presence --> pres_done[("specimen_index_presence/")]
index_presence --> pres_istats[("stats/indexing_presence/")]
pres_istats --> aggregate_index_presence
pres_done --> merge_presence
BIN --> merge_presence
merge_presence --> gpres[("global_index_presence/")]
refnpz --> verify_presence
pres_done --> verify_presence
verify_presence --> vpres_stats[("stats/verify_presence/")]
vpres_stats --> aggregate_verify_presence
gpres --> filter_presence
BIN --> filter_presence
filter_presence --> spec_pres[("specific_index_presence/")]
filter_presence --> spec_pres_stats[("stats/specific_kmer_presence/")]
spec_pres_stats --> aggregate_filter_presence
refnpz --> verify_merge_presence
gpres --> verify_merge_presence
verify_merge_presence --> vmp[("stats/verify_merge_presence/")]
end
subgraph count ["Count track"]
simdata --> index_count
BIN --> index_count
index_count --> count_done[("specimen_index_count/")]
index_count --> count_istats[("stats/indexing_count/")]
count_istats --> aggregate_index_count
count_done --> merge_count
BIN --> merge_count
merge_count --> gcount[("global_index_count/")]
refnpz --> verify_count
count_done --> verify_count
verify_count --> vcount_stats[("stats/verify_count/")]
vcount_stats --> aggregate_verify_count
gcount --> filter_count
BIN --> filter_count
filter_count --> spec_count[("specific_index_count/")]
filter_count --> spec_count_stats[("stats/specific_kmer_count/")]
spec_count_stats --> aggregate_filter_count
refnpz --> verify_merge_count
gcount --> verify_merge_count
verify_merge_count --> vmc[("stats/verify_merge_count/")]
end
aggregate_verify_presence --> all
aggregate_verify_count --> all
vmp --> all
vmc --> all
all -. "$(MAKE) re-eval" .-> aggregate_filter_presence
all -. "$(MAKE) re-eval" .-> aggregate_filter_count
```
## Steps
| Target | Script | Description |
|---|---|---|
| `simulate` | `simulate.sh` | Simulate sequencing reads from the reference genomes |
| `reference` | `build_reference.sh` | Build reference kmer sets (`.npz`) from simulation truth |
| `index_presence` | `index_one_presence.sh` | Index each specimen (presence mode) |
| `index_count` | `index_one_count.sh` | Index each specimen (count mode) |
| `aggregate_index_presence` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (presence) |
| `aggregate_index_count` | `aggregate_stats.sh` | Aggregate per-specimen indexing stats (count) |
| `merge_presence` | `merge_presence.sh` | Merge all specimen presence indexes into a global index |
| `merge_count` | `merge_count.sh` | Merge all specimen count indexes into a global index |
| `verify_presence` | `verify_one_presence.sh` | Verify each specimen presence index against reference |
| `verify_count` | `verify_one_count.sh` | Verify each specimen count index against reference |
| `aggregate_verify_presence` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (presence) |
| `aggregate_verify_count` | `aggregate_stats.sh` | Aggregate per-specimen verification stats (count) |
| `filter_presence` | `filter_one_presence.sh` | Extract species-specific presence indexes from global index |
| `filter_count` | `filter_one_count.sh` | Extract species-specific count indexes from global index |
| `aggregate_filter_presence` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (presence) |
| `aggregate_filter_count` | `aggregate_stats.sh` | Aggregate species-specific kmer stats (count) |
| `verify_merge_presence` | `verify_merge_presence.sh` | Verify global presence index against all reference sets |
| `verify_merge_count` | `verify_merge_count.sh` | Verify global count index against all reference sets |
## Directory layout
```
benchmark/
├── genomes/ # input reference genomes (.fna.gz)
├── simulated_data/ # generated by simulate
│ └── <species>/<specimen>/
├── reference_index/ # reference kmer sets (.npz)
├── specimen_index_presence/ # per-specimen presence indexes
├── specimen_index_count/ # per-specimen count indexes
├── global_index_presence/ # merged global presence index
├── global_index_count/ # merged global count index
├── specific_index_presence/ # species-specific presence indexes
├── specific_index_count/ # species-specific count indexes
└── stats/ # all benchmark statistics
├── indexing_presence/
├── indexing_count/
├── verify_presence/
├── verify_count/
├── specific_kmer_presence/
├── specific_kmer_count/
├── verify_merge_presence/
└── verify_merge_count/
```
+53
View File
@@ -0,0 +1,53 @@
#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
# TYPE = indexing_presence | indexing_count | verify_presence | verify_count
#
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header).
# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
# the most recent run CSV (idempotent when nothing changed).
set -euo pipefail
TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
case "${TYPE}" in
indexing_presence|indexing_count)
HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
;;
verify_presence)
HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
;;
verify_count)
HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
;;
specific_kmer_presence|specific_kmer_count)
HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
;;
*)
echo "ERROR: unknown stats type '${TYPE}'" >&2
exit 1
;;
esac
# Find most recent existing run CSV (empty string if none).
latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -1)
# Check if any .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]] && \
[[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
echo "[${TYPE}] stats up to date (${latest_csv})"
exit 0
fi
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"
echo "${HEADER}" >"${CSV}"
# Sort .stats files by name for reproducible row order.
while IFS= read -r stats_file; do
sed "s/^/${run_n},/" "${stats_file}"
done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort) >>"${CSV}"
echo "[${TYPE}] run ${run_n}${CSV}"
+137
View File
@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""Build a reference kmer index from paired-end FASTQ reads.
Extracts canonical kmers — min(kmer, revcomp(kmer)) encoded as uint64 —
counts their abundances, and saves a sorted numpy pair (kmers, counts).
Output .npz arrays
kmers : uint64, sorted ascending — canonical kmer integers
counts : uint32, same order — raw read abundances
"""
import argparse
import gzip
import sys
from collections import defaultdict
import numpy as np
# ── encoding ────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
# Lookup table: revcomp of one byte (4 bases, 8 bits).
# Precomputed once at import time.
_REVCOMP8 = [0] * 256
for _i in range(256):
_rc, _x = 0, _i
for _ in range(4):
_rc = (_rc << 2) | (3 - (_x & 3))
_x >>= 2
_REVCOMP8[_i] = _rc
del _i, _rc, _x
def revcomp_int(kmer: int, k: int) -> int:
"""Reverse-complement of a kmer encoded as an integer (2 bits/base).
Uses byte-level lookup (4 bases at a time) for speed.
"""
rc = 0
bits_left = 2 * k
while bits_left > 0:
chunk = min(8, bits_left)
rc_byte = _REVCOMP8[kmer & 0xFF] >> (8 - chunk)
rc = (rc << chunk) | rc_byte
kmer >>= chunk
bits_left -= chunk
return rc
# ── FASTQ parsing ────────────────────────────────────────────────────────────
def iter_sequences(path: str):
"""Yield raw sequences from a (gzipped) FASTQ file."""
opener = gzip.open if path.endswith('.gz') else open
with opener(path, 'rt') as fh:
while True:
if not fh.readline(): # '@' header
break
seq = fh.readline().rstrip('\n')
fh.readline() # '+'
fh.readline() # quality
yield seq
# ── kmer counting ────────────────────────────────────────────────────────────
def count_kmers(paths: list[str], k: int) -> dict[int, int]:
mask = (1 << (2 * k)) - 1
counts: dict[int, int] = defaultdict(int)
n_reads = 0
for path in paths:
for seq in iter_sequences(path):
n_reads += 1
kmer = 0
run = 0 # consecutive valid bases
for c in seq:
b = _ENCODE.get(c)
if b is None: # N or unexpected character → reset
kmer = 0
run = 0
continue
kmer = ((kmer << 2) | b) & mask
run += 1
if run >= k:
rc = revcomp_int(kmer, k)
counts[kmer if kmer <= rc else rc] += 1
if n_reads % 100_000 == 0:
print(f' {n_reads:,} reads processed, '
f'{len(counts):,} distinct kmers so far',
file=sys.stderr)
print(f' {n_reads:,} reads total, {len(counts):,} distinct kmers',
file=sys.stderr)
return counts
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('reads', nargs='+', metavar='FASTQ',
help='Input reads (FASTQ, gzip OK)')
ap.add_argument('-k', '--kmer-size', type=int, default=31,
metavar='K')
ap.add_argument('--min-abundance', type=int, default=1,
metavar='N', help='Drop kmers with count < N (default 1)')
ap.add_argument('-o', '--output', required=True,
metavar='FILE', help='Output .npz path')
args = ap.parse_args()
print(f'k={args.kmer_size} files={len(args.reads)}', file=sys.stderr)
counts = count_kmers(args.reads, args.kmer_size)
if args.min_abundance > 1:
before = len(counts)
counts = {k: v for k, v in counts.items() if v >= args.min_abundance}
print(f' min-abundance={args.min_abundance}: '
f'{before - len(counts):,} kmers dropped, '
f'{len(counts):,} retained',
file=sys.stderr)
print(f'Sorting and saving → {args.output}', file=sys.stderr)
kmers_arr = np.fromiter(sorted(counts), dtype=np.uint64, count=len(counts))
counts_arr = np.array([counts[int(k)] for k in kmers_arr], dtype=np.uint32)
np.savez_compressed(args.output, kmers=kmers_arr, counts=counts_arr)
print(f'Done {len(kmers_arr):,} kmers → {args.output}', file=sys.stderr)
if __name__ == '__main__':
main()
+39
View File
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
REF_DIR="${SCRIPT_DIR}/reference_index"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
BUILD_PY="${SCRIPT_DIR}/build_reference.py"
KMER_SIZE="${KMER_SIZE:-31}"
MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
mkdir -p "${REF_DIR}"
for species_dir in "${SIMDATA_DIR}"/*/; do
[[ -d "${species_dir}" ]] || continue
species=$(basename "${species_dir}")
for strain_dir in "${species_dir}"*/; do
[[ -d "${strain_dir}" ]] || continue
strain=$(basename "${strain_dir}")
r1="${strain_dir}/reads_R1.fastq.gz"
r2="${strain_dir}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
echo "SKIP ${species}--${strain}: reads not found" >&2
continue
fi
out="${REF_DIR}/${species}--${strain}.npz"
echo "[${species}--${strain}] → ${out}"
"${PYTHON}" "${BUILD_PY}" \
--kmer-size "${KMER_SIZE}" \
--min-abundance "${MIN_ABUNDANCE}" \
--output "${out}" \
"${r1}" "${r2}"
done
done
+199
View File
@@ -0,0 +1,199 @@
SPECIMENS := Escherichia_coli--K-12_MG1655 Escherichia_coli--EDL933 Salmonella_enterica--LT2 Escherichia_coli--CFT073 Bacillus_subtilis--168 Salmonella_enterica--P125109 Shouchella_clausii--KSM-K16 Escherichia_coli--K-12_W3110 Klebsiella_pneumoniae--MGH_78578 Opitutus_terrae--PB90-1 Saccharolobus_islandicus--M.16.4 Acidobacterium_capsulatum--ATCC_51196 Salmonella_enterica--AKU_12601 Proteus_mirabilis--HI4320 Salmonella_enterica--CT18 Klebsiella_pneumoniae--HS11286 Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1 Klebsiella_pneumoniae--ATCC_13883 Yersinia_ruckeri--YRB Candidozyma_auris--GCF_003013715.1_ASM301371v2
SPECIES := Escherichia_coli Salmonella_enterica Bacillus_subtilis Shouchella_clausii Klebsiella_pneumoniae Opitutus_terrae Saccharolobus_islandicus Acidobacterium_capsulatum Proteus_mirabilis Wolbachia_endosymbiont Yersinia_ruckeri Candidozyma_auris
# Escherichia_coli--K-12_MG1655
simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz: genomes/GCF_000005845.2_ASM584v2_genomic.fna.gz
reference_index/Escherichia_coli--K-12_MG1655.npz: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done stats/indexing_presence/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--K-12_MG1655/index.done stats/indexing_count/Escherichia_coli--K-12_MG1655.stats: simulated_data/Escherichia_coli/K-12_MG1655/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_presence/Escherichia_coli--K-12_MG1655/index.done
stats/verify_count/Escherichia_coli--K-12_MG1655.stats: reference_index/Escherichia_coli--K-12_MG1655.npz specimen_index_count/Escherichia_coli--K-12_MG1655/index.done
# Escherichia_coli--EDL933
simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz: genomes/GCF_000006665.1_ASM666v1_genomic.fna.gz
reference_index/Escherichia_coli--EDL933.npz: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--EDL933/index.done stats/indexing_presence/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--EDL933/index.done stats/indexing_count/Escherichia_coli--EDL933.stats: simulated_data/Escherichia_coli/EDL933/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_presence/Escherichia_coli--EDL933/index.done
stats/verify_count/Escherichia_coli--EDL933.stats: reference_index/Escherichia_coli--EDL933.npz specimen_index_count/Escherichia_coli--EDL933/index.done
# Salmonella_enterica--LT2
simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz: genomes/GCF_000006945.2_ASM694v2_genomic.fna.gz
reference_index/Salmonella_enterica--LT2.npz: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--LT2/index.done stats/indexing_presence/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--LT2/index.done stats/indexing_count/Salmonella_enterica--LT2.stats: simulated_data/Salmonella_enterica/LT2/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_presence/Salmonella_enterica--LT2/index.done
stats/verify_count/Salmonella_enterica--LT2.stats: reference_index/Salmonella_enterica--LT2.npz specimen_index_count/Salmonella_enterica--LT2/index.done
# Escherichia_coli--CFT073
simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz: genomes/GCF_000007445.1_ASM744v1_genomic.fna.gz
reference_index/Escherichia_coli--CFT073.npz: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--CFT073/index.done stats/indexing_presence/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--CFT073/index.done stats/indexing_count/Escherichia_coli--CFT073.stats: simulated_data/Escherichia_coli/CFT073/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_presence/Escherichia_coli--CFT073/index.done
stats/verify_count/Escherichia_coli--CFT073.stats: reference_index/Escherichia_coli--CFT073.npz specimen_index_count/Escherichia_coli--CFT073/index.done
# Bacillus_subtilis--168
simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz: genomes/GCF_000009045.1_ASM904v1_genomic.fna.gz
reference_index/Bacillus_subtilis--168.npz: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
specimen_index_presence/Bacillus_subtilis--168/index.done stats/indexing_presence/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
specimen_index_count/Bacillus_subtilis--168/index.done stats/indexing_count/Bacillus_subtilis--168.stats: simulated_data/Bacillus_subtilis/168/reads_R1.fastq.gz
stats/verify_presence/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_presence/Bacillus_subtilis--168/index.done
stats/verify_count/Bacillus_subtilis--168.stats: reference_index/Bacillus_subtilis--168.npz specimen_index_count/Bacillus_subtilis--168/index.done
# Salmonella_enterica--P125109
simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz: genomes/GCF_000009505.1_ASM950v1_genomic.fna.gz
reference_index/Salmonella_enterica--P125109.npz: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--P125109/index.done stats/indexing_presence/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--P125109/index.done stats/indexing_count/Salmonella_enterica--P125109.stats: simulated_data/Salmonella_enterica/P125109/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_presence/Salmonella_enterica--P125109/index.done
stats/verify_count/Salmonella_enterica--P125109.stats: reference_index/Salmonella_enterica--P125109.npz specimen_index_count/Salmonella_enterica--P125109/index.done
# Shouchella_clausii--KSM-K16
simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz: genomes/GCF_000009825.1_ASM982v1_genomic.fna.gz
reference_index/Shouchella_clausii--KSM-K16.npz: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
specimen_index_presence/Shouchella_clausii--KSM-K16/index.done stats/indexing_presence/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
specimen_index_count/Shouchella_clausii--KSM-K16/index.done stats/indexing_count/Shouchella_clausii--KSM-K16.stats: simulated_data/Shouchella_clausii/KSM-K16/reads_R1.fastq.gz
stats/verify_presence/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_presence/Shouchella_clausii--KSM-K16/index.done
stats/verify_count/Shouchella_clausii--KSM-K16.stats: reference_index/Shouchella_clausii--KSM-K16.npz specimen_index_count/Shouchella_clausii--KSM-K16/index.done
# Escherichia_coli--K-12_W3110
simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz: genomes/GCF_000010245.2_ASM1024v1_genomic.fna.gz
reference_index/Escherichia_coli--K-12_W3110.npz: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
specimen_index_presence/Escherichia_coli--K-12_W3110/index.done stats/indexing_presence/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
specimen_index_count/Escherichia_coli--K-12_W3110/index.done stats/indexing_count/Escherichia_coli--K-12_W3110.stats: simulated_data/Escherichia_coli/K-12_W3110/reads_R1.fastq.gz
stats/verify_presence/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_presence/Escherichia_coli--K-12_W3110/index.done
stats/verify_count/Escherichia_coli--K-12_W3110.stats: reference_index/Escherichia_coli--K-12_W3110.npz specimen_index_count/Escherichia_coli--K-12_W3110/index.done
# Klebsiella_pneumoniae--MGH_78578
simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz: genomes/GCF_000016305.1_ASM1630v1_genomic.fna.gz
reference_index/Klebsiella_pneumoniae--MGH_78578.npz: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_presence/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done stats/indexing_count/Klebsiella_pneumoniae--MGH_78578.stats: simulated_data/Klebsiella_pneumoniae/MGH_78578/reads_R1.fastq.gz
stats/verify_presence/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_presence/Klebsiella_pneumoniae--MGH_78578/index.done
stats/verify_count/Klebsiella_pneumoniae--MGH_78578.stats: reference_index/Klebsiella_pneumoniae--MGH_78578.npz specimen_index_count/Klebsiella_pneumoniae--MGH_78578/index.done
# Opitutus_terrae--PB90-1
simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz: genomes/GCF_000019965.1_ASM1996v1_genomic.fna.gz
reference_index/Opitutus_terrae--PB90-1.npz: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
specimen_index_presence/Opitutus_terrae--PB90-1/index.done stats/indexing_presence/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
specimen_index_count/Opitutus_terrae--PB90-1/index.done stats/indexing_count/Opitutus_terrae--PB90-1.stats: simulated_data/Opitutus_terrae/PB90-1/reads_R1.fastq.gz
stats/verify_presence/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_presence/Opitutus_terrae--PB90-1/index.done
stats/verify_count/Opitutus_terrae--PB90-1.stats: reference_index/Opitutus_terrae--PB90-1.npz specimen_index_count/Opitutus_terrae--PB90-1/index.done
# Saccharolobus_islandicus--M.16.4
simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz: genomes/GCF_000022445.1_ASM2244v1_genomic.fna.gz
reference_index/Saccharolobus_islandicus--M.16.4.npz: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_presence/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done stats/indexing_count/Saccharolobus_islandicus--M.16.4.stats: simulated_data/Saccharolobus_islandicus/M.16.4/reads_R1.fastq.gz
stats/verify_presence/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_presence/Saccharolobus_islandicus--M.16.4/index.done
stats/verify_count/Saccharolobus_islandicus--M.16.4.stats: reference_index/Saccharolobus_islandicus--M.16.4.npz specimen_index_count/Saccharolobus_islandicus--M.16.4/index.done
# Acidobacterium_capsulatum--ATCC_51196
simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz: genomes/GCF_000022565.1_ASM2256v1_genomic.fna.gz
reference_index/Acidobacterium_capsulatum--ATCC_51196.npz: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_presence/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done stats/indexing_count/Acidobacterium_capsulatum--ATCC_51196.stats: simulated_data/Acidobacterium_capsulatum/ATCC_51196/reads_R1.fastq.gz
stats/verify_presence/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_presence/Acidobacterium_capsulatum--ATCC_51196/index.done
stats/verify_count/Acidobacterium_capsulatum--ATCC_51196.stats: reference_index/Acidobacterium_capsulatum--ATCC_51196.npz specimen_index_count/Acidobacterium_capsulatum--ATCC_51196/index.done
# Salmonella_enterica--AKU_12601
simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz: genomes/GCF_000026565.1_ASM2656v1_genomic.fna.gz
reference_index/Salmonella_enterica--AKU_12601.npz: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--AKU_12601/index.done stats/indexing_presence/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--AKU_12601/index.done stats/indexing_count/Salmonella_enterica--AKU_12601.stats: simulated_data/Salmonella_enterica/AKU_12601/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_presence/Salmonella_enterica--AKU_12601/index.done
stats/verify_count/Salmonella_enterica--AKU_12601.stats: reference_index/Salmonella_enterica--AKU_12601.npz specimen_index_count/Salmonella_enterica--AKU_12601/index.done
# Proteus_mirabilis--HI4320
simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz: genomes/GCF_000069965.1_ASM6996v1_genomic.fna.gz
reference_index/Proteus_mirabilis--HI4320.npz: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
specimen_index_presence/Proteus_mirabilis--HI4320/index.done stats/indexing_presence/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
specimen_index_count/Proteus_mirabilis--HI4320/index.done stats/indexing_count/Proteus_mirabilis--HI4320.stats: simulated_data/Proteus_mirabilis/HI4320/reads_R1.fastq.gz
stats/verify_presence/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_presence/Proteus_mirabilis--HI4320/index.done
stats/verify_count/Proteus_mirabilis--HI4320.stats: reference_index/Proteus_mirabilis--HI4320.npz specimen_index_count/Proteus_mirabilis--HI4320/index.done
# Salmonella_enterica--CT18
simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz: genomes/GCF_000195995.1_ASM19599v1_genomic.fna.gz
reference_index/Salmonella_enterica--CT18.npz: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
specimen_index_presence/Salmonella_enterica--CT18/index.done stats/indexing_presence/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
specimen_index_count/Salmonella_enterica--CT18/index.done stats/indexing_count/Salmonella_enterica--CT18.stats: simulated_data/Salmonella_enterica/CT18/reads_R1.fastq.gz
stats/verify_presence/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_presence/Salmonella_enterica--CT18/index.done
stats/verify_count/Salmonella_enterica--CT18.stats: reference_index/Salmonella_enterica--CT18.npz specimen_index_count/Salmonella_enterica--CT18/index.done
# Klebsiella_pneumoniae--HS11286
simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz: genomes/GCF_000240185.1_ASM24018v2_genomic.fna.gz
reference_index/Klebsiella_pneumoniae--HS11286.npz: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_presence/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done stats/indexing_count/Klebsiella_pneumoniae--HS11286.stats: simulated_data/Klebsiella_pneumoniae/HS11286/reads_R1.fastq.gz
stats/verify_presence/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_presence/Klebsiella_pneumoniae--HS11286/index.done
stats/verify_count/Klebsiella_pneumoniae--HS11286.stats: reference_index/Klebsiella_pneumoniae--HS11286.npz specimen_index_count/Klebsiella_pneumoniae--HS11286/index.done
# Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1
simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz: genomes/GCF_000306885.1_ASM30688v1_genomic.fna.gz
reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done stats/indexing_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: simulated_data/Wolbachia_endosymbiont/GCF_000306885.1_ASM30688v1/reads_R1.fastq.gz
stats/verify_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_presence/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
stats/verify_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.stats: reference_index/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1.npz specimen_index_count/Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1/index.done
# Klebsiella_pneumoniae--ATCC_13883
simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz: genomes/GCF_000742135.1_ASM74213v1_genomic.fna.gz
reference_index/Klebsiella_pneumoniae--ATCC_13883.npz: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_presence/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done stats/indexing_count/Klebsiella_pneumoniae--ATCC_13883.stats: simulated_data/Klebsiella_pneumoniae/ATCC_13883/reads_R1.fastq.gz
stats/verify_presence/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_presence/Klebsiella_pneumoniae--ATCC_13883/index.done
stats/verify_count/Klebsiella_pneumoniae--ATCC_13883.stats: reference_index/Klebsiella_pneumoniae--ATCC_13883.npz specimen_index_count/Klebsiella_pneumoniae--ATCC_13883/index.done
# Yersinia_ruckeri--YRB
simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz: genomes/GCF_000834255.1_ASM83425v1_genomic.fna.gz
reference_index/Yersinia_ruckeri--YRB.npz: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
specimen_index_presence/Yersinia_ruckeri--YRB/index.done stats/indexing_presence/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
specimen_index_count/Yersinia_ruckeri--YRB/index.done stats/indexing_count/Yersinia_ruckeri--YRB.stats: simulated_data/Yersinia_ruckeri/YRB/reads_R1.fastq.gz
stats/verify_presence/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_presence/Yersinia_ruckeri--YRB/index.done
stats/verify_count/Yersinia_ruckeri--YRB.stats: reference_index/Yersinia_ruckeri--YRB.npz specimen_index_count/Yersinia_ruckeri--YRB/index.done
# Candidozyma_auris--GCF_003013715.1_ASM301371v2
simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz: genomes/GCF_003013715.1_ASM301371v2_genomic.fna.gz
reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done stats/indexing_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: simulated_data/Candidozyma_auris/GCF_003013715.1_ASM301371v2/reads_R1.fastq.gz
stats/verify_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_presence/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
stats/verify_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2.stats: reference_index/Candidozyma_auris--GCF_003013715.1_ASM301371v2.npz specimen_index_count/Candidozyma_auris--GCF_003013715.1_ASM301371v2/index.done
# Escherichia_coli
specific_index_presence/Escherichia_coli/index.done stats/specific_kmer_presence/Escherichia_coli.stats: global_index_presence/index.done
specific_index_count/Escherichia_coli/index.done stats/specific_kmer_count/Escherichia_coli.stats: global_index_count/index.done
# Salmonella_enterica
specific_index_presence/Salmonella_enterica/index.done stats/specific_kmer_presence/Salmonella_enterica.stats: global_index_presence/index.done
specific_index_count/Salmonella_enterica/index.done stats/specific_kmer_count/Salmonella_enterica.stats: global_index_count/index.done
# Bacillus_subtilis
specific_index_presence/Bacillus_subtilis/index.done stats/specific_kmer_presence/Bacillus_subtilis.stats: global_index_presence/index.done
specific_index_count/Bacillus_subtilis/index.done stats/specific_kmer_count/Bacillus_subtilis.stats: global_index_count/index.done
# Shouchella_clausii
specific_index_presence/Shouchella_clausii/index.done stats/specific_kmer_presence/Shouchella_clausii.stats: global_index_presence/index.done
specific_index_count/Shouchella_clausii/index.done stats/specific_kmer_count/Shouchella_clausii.stats: global_index_count/index.done
# Klebsiella_pneumoniae
specific_index_presence/Klebsiella_pneumoniae/index.done stats/specific_kmer_presence/Klebsiella_pneumoniae.stats: global_index_presence/index.done
specific_index_count/Klebsiella_pneumoniae/index.done stats/specific_kmer_count/Klebsiella_pneumoniae.stats: global_index_count/index.done
# Opitutus_terrae
specific_index_presence/Opitutus_terrae/index.done stats/specific_kmer_presence/Opitutus_terrae.stats: global_index_presence/index.done
specific_index_count/Opitutus_terrae/index.done stats/specific_kmer_count/Opitutus_terrae.stats: global_index_count/index.done
# Saccharolobus_islandicus
specific_index_presence/Saccharolobus_islandicus/index.done stats/specific_kmer_presence/Saccharolobus_islandicus.stats: global_index_presence/index.done
specific_index_count/Saccharolobus_islandicus/index.done stats/specific_kmer_count/Saccharolobus_islandicus.stats: global_index_count/index.done
# Acidobacterium_capsulatum
specific_index_presence/Acidobacterium_capsulatum/index.done stats/specific_kmer_presence/Acidobacterium_capsulatum.stats: global_index_presence/index.done
specific_index_count/Acidobacterium_capsulatum/index.done stats/specific_kmer_count/Acidobacterium_capsulatum.stats: global_index_count/index.done
# Proteus_mirabilis
specific_index_presence/Proteus_mirabilis/index.done stats/specific_kmer_presence/Proteus_mirabilis.stats: global_index_presence/index.done
specific_index_count/Proteus_mirabilis/index.done stats/specific_kmer_count/Proteus_mirabilis.stats: global_index_count/index.done
# Wolbachia_endosymbiont
specific_index_presence/Wolbachia_endosymbiont/index.done stats/specific_kmer_presence/Wolbachia_endosymbiont.stats: global_index_presence/index.done
specific_index_count/Wolbachia_endosymbiont/index.done stats/specific_kmer_count/Wolbachia_endosymbiont.stats: global_index_count/index.done
# Yersinia_ruckeri
specific_index_presence/Yersinia_ruckeri/index.done stats/specific_kmer_presence/Yersinia_ruckeri.stats: global_index_presence/index.done
specific_index_count/Yersinia_ruckeri/index.done stats/specific_kmer_count/Yersinia_ruckeri.stats: global_index_count/index.done
# Candidozyma_auris
specific_index_presence/Candidozyma_auris/index.done stats/specific_kmer_presence/Candidozyma_auris.stats: global_index_presence/index.done
specific_index_count/Candidozyma_auris/index.done stats/specific_kmer_count/Candidozyma_auris.stats: global_index_count/index.done
+48
View File
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -euo pipefail
assemblies=(
GCF_000005845.2
GCF_000010245.2
GCF_000007445.1
GCF_000006665.1
GCF_000006945.2
GCF_000195995.1
GCF_000009505.1
GCF_000026565.1
GCF_000016305.1
GCF_000019965.1
GCF_000240185.1
GCF_000742135.1
GCF_000069965.1
GCF_000022565.1
GCF_000306885.1
GCF_003013715.1
GCF_000009045.1
GCF_000009825.1
GCF_000022445.1
GCF_000834255.1
)
mkdir -p genomes
for acc in "${assemblies[@]}"; do
echo "Downloading ${acc}"
datasets download genome accession "${acc}" \
--include genome \
--filename "${acc}.zip"
unzip -q "${acc}.zip" -d "${acc}"
find "${acc}" -name "*.fna" |
while read file; do
obiconvert -Z ${file} >genomes/$(basename ${file}).gz
done
rm -rf "${acc}" "${acc}.zip"
done
+108
View File
@@ -0,0 +1,108 @@
#!/usr/bin/env bash
# Usage: filter_one_count.sh SPECIES
# Filters global_index_count to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
# specific_index_count/SPECIES/index.done (written by obikmer select)
# stats/specific_kmer_count/SPECIES.stats (one CSV data row, no header)
set -euo pipefail
SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
SOURCE="${SCRIPT_DIR}/global_index_count"
OUTPUT="${SCRIPT_DIR}/specific_index_count/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_count"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIES}] filter (count) → ${OUTPUT}"
LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
"${BINARY}" filter \
--output "${OUTPUT}" \
--force \
--ingroup "species=${SPECIES}" \
--outgroup all \
--min-frac 0.5 \
--max-frac 1.0 \
--max-outgroup-count 0 \
"${SOURCE}" \
2>"${LOG_FILTER}"
cat "${LOG_FILTER}" >&2
"${BINARY}" select \
--in-place \
--group "${SPECIES}:species=${SPECIES}" \
--group-op "${SPECIES}:any" \
--select "${SPECIES}" \
"${OUTPUT}" \
2>"${LOG_SELECT}"
cat "${LOG_SELECT}" >&2
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
def parse_reporter(logfile):
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats['TOTAL'] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
return stats
f = parse_reporter(log_filter)
s = parse_reporter(log_select)
row = [species]
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
key = 'TOTAL' if stage.endswith('_total') else stage
w, r = d.get(key, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
print(','.join(row))
PYEOF
+108
View File
@@ -0,0 +1,108 @@
#!/usr/bin/env bash
# Usage: filter_one_presence.sh SPECIES
# Filters global_index_presence to keep only kmers specific to SPECIES,
# then selects the SPECIES column in-place.
# Outputs:
# specific_index_presence/SPECIES/index.done (written by obikmer select)
# stats/specific_kmer_presence/SPECIES.stats (one CSV data row, no header)
set -euo pipefail
SPECIES="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
SOURCE="${SCRIPT_DIR}/global_index_presence"
OUTPUT="${SCRIPT_DIR}/specific_index_presence/${SPECIES}"
STATS_DIR="${SCRIPT_DIR}/stats/specific_kmer_presence"
STATS_FILE="${STATS_DIR}/${SPECIES}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIES}] filter (presence) → ${OUTPUT}"
LOG_FILTER=$(mktemp)
LOG_SELECT=$(mktemp)
trap 'rm -f "${LOG_FILTER}" "${LOG_SELECT}"' EXIT
"${BINARY}" filter \
--output "${OUTPUT}" \
--force \
--ingroup "species=${SPECIES}" \
--outgroup all \
--min-frac 0.5 \
--max-frac 1.0 \
--max-outgroup-count 0 \
"${SOURCE}" \
2>"${LOG_FILTER}"
cat "${LOG_FILTER}" >&2
"${BINARY}" select \
--in-place \
--group "${SPECIES}:species=${SPECIES}" \
--group-op "${SPECIES}:any" \
--select "${SPECIES}" \
"${OUTPUT}" \
2>"${LOG_SELECT}"
cat "${LOG_SELECT}" >&2
python3 - "${SPECIES}" "${LOG_FILTER}" "${LOG_SELECT}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, log_filter, log_select = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
def parse_reporter(logfile):
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats['TOTAL'] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
return stats
f = parse_reporter(log_filter)
s = parse_reporter(log_select)
row = [species]
for stage, d in [('rebuild', f), ('pack', f), ('filter_total', f), ('select', s), ('select_total', s)]:
key = 'TOTAL' if stage.endswith('_total') else stage
w, r = d.get(key, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
print(','.join(row))
PYEOF
+103
View File
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
# Usage: index_one_count.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
# specimen_index_count/SPECIMEN/index.done (written by obikmer)
# stats/indexing_count/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
echo "ERROR: reads not found in ${READS_DIR}" >&2
exit 1
fi
echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" index \
--output "${INDEX_PATH}" \
--force \
--theta 0 \
--with-counts \
--label "${SPECIMEN}" \
--meta "species=${species}" \
"${r1}" "${r2}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
+102
View File
@@ -0,0 +1,102 @@
#!/usr/bin/env bash
# Usage: index_one_presence.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Outputs:
# specimen_index_presence/SPECIMEN/index.done (written by obikmer)
# stats/indexing_presence/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
echo "ERROR: reads not found in ${READS_DIR}" >&2
exit 1
fi
echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" index \
--output "${INDEX_PATH}" \
--force \
--theta 0 \
--label "${SPECIMEN}" \
--meta "species=${species}" \
"${r1}" "${r2}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
import sys, re
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s): state = 'rows'
elif state == 'rows':
if is_sep(s): state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
row = [species, strain]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
+118
View File
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
Like C .d files: only target: prerequisites lines, no recipes.
Recipes stay in the Makefile as generic rules.
"""
import gzip
import re
import sys
from pathlib import Path
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
'endosymbiont', 'of'}
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
def is_stop(tok):
t = tok.lower()
return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)
def sanitize(s):
return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')
def collect_tokens(text):
parts = []
for tok in text.split():
tok = tok.rstrip(',.')
if is_stop(tok):
break
parts.append(sanitize(tok))
return '_'.join(filter(None, parts))
def parse_organism(defn, gcf_id):
words = defn.split()
species = sanitize(words[0] + '_' + words[1])
m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
if m:
strain = sanitize(m.group(1))
if m.group(2):
strain += '_' + sanitize(m.group(2))
return species, strain
m = re.search(r'\bstrain\b\s+(.*)', defn)
if m:
strain = collect_tokens(m.group(1))
if strain:
return species, strain
remainder = re.sub(r'^\S+ \S+\s*', '', defn)
remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
strain = collect_tokens(remainder)
return species, strain if strain else gcf_id
def first_definition(path):
with gzip.open(path, 'rt') as fh:
for line in fh:
if line.startswith('>'):
m = re.search(r'"definition":"([^"]*)"', line)
return m.group(1) if m else line[1:].split()[0]
return Path(path).stem
def main():
entries = [] # (specimen, species, sim_dir, genome_path)
species_seen = []
for path in sorted(sys.argv[1:]):
gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
defn = first_definition(path)
sp, st = parse_organism(defn, gcf_id)
specimen = f'{sp}--{st}'
sim_dir = f'simulated_data/{sp}/{st}'
entries.append((specimen, sp, sim_dir, path))
if sp not in species_seen:
species_seen.append(sp)
specimens = [e[0] for e in entries]
print('SPECIMENS :=', ' '.join(specimens))
print('SPECIES :=', ' '.join(species_seen))
for specimen, species, sim_dir, genome in entries:
reads = f'{sim_dir}/reads_R1.fastq.gz'
p_done = f'specimen_index_presence/{specimen}/index.done'
p_stats = f'stats/indexing_presence/{specimen}.stats'
c_done = f'specimen_index_count/{specimen}/index.done'
c_stats = f'stats/indexing_count/{specimen}.stats'
ref = f'reference_index/{specimen}.npz'
vp = f'stats/verify_presence/{specimen}.stats'
vc = f'stats/verify_count/{specimen}.stats'
print()
print(f'# {specimen}')
print(f'{reads}: {genome}')
print(f'{ref}: {reads}')
print(f'{p_done} {p_stats}: {reads}')
print(f'{c_done} {c_stats}: {reads}')
print(f'{vp}: {ref} {p_done}')
print(f'{vc}: {ref} {c_done}')
print()
for sp in species_seen:
sp_done = f'specific_index_presence/{sp}/index.done'
sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
sc_done = f'specific_index_count/{sp}/index.done'
sc_stats = f'stats/specific_kmer_count/{sp}.stats'
print(f'# {sp}')
print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
print(f'{sc_done} {sc_stats}: global_index_count/index.done')
if __name__ == '__main__':
main()
+103
View File
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
OUTPUT="${SCRIPT_DIR}/global_index_count"
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"
mkdir -p "${STATS_DIR}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"
printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
parse_reporter() {
local run="$1" n_sources="$2" logfile="$3"
python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re
run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s):
state = 'rows'
elif state == 'rows':
if is_sep(s):
state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
if [[ ${#sources[@]} -eq 0 ]]; then
echo "ERROR: no indexes found in ${IDX_DIR}" >&2
exit 1
fi
echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" merge \
--output "${OUTPUT}" \
--force \
"${sources[@]}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
echo "Done. Run ${run_n}${CSV}"
+104
View File
@@ -0,0 +1,104 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_presence"
OUTPUT="${SCRIPT_DIR}/global_index_presence"
STATS_DIR="${SCRIPT_DIR}/stats/merge_presence"
mkdir -p "${STATS_DIR}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')")
CSV="${STATS_DIR}/run_${run_n}.csv"
printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
parse_reporter() {
local run="$1" n_sources="$2" logfile="$3"
python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re
run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
def strip_ansi(s):
return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
def parse_wall(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1000.0
if s.endswith('s'): return float(s[:-1])
return 0.0
def parse_rss(s):
m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
if not m: return 0
return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
def is_sep(s):
return bool(s) and not re.search(r'[A-Za-z0-9]', s)
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
for raw in fh:
line = strip_ansi(raw.rstrip('\n'))
s = line.strip()
if state == 'scan':
if re.search(r'\bstage\b.*\bwall\b', line):
state = 'in_header'
elif state == 'in_header':
if is_sep(s):
state = 'rows'
elif state == 'rows':
if is_sep(s):
state = 'total'
elif s:
parts = re.split(r' +', s)
if len(parts) >= 4:
stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
elif state == 'total':
if s:
parts = re.split(r' +', s)
if len(parts) >= 3:
stats[parts[0]] = (parse_wall(parts[1]),
parse_rss(parts[3]) if len(parts) > 3 else 0)
break
STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
w, r = stats.get(stage, ('', ''))
row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)
if [[ ${#sources[@]} -eq 0 ]]; then
echo "ERROR: no indexes found in ${IDX_DIR}" >&2
exit 1
fi
echo "Merging ${#sources[@]} presence indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT
"${BINARY}" merge \
--output "${OUTPUT}" \
--force \
--force-presence \
"${sources[@]}" \
2>"${STDERR_LOG}"
cat "${STDERR_LOG}" >&2
parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"
echo "Done. Run ${run_n}${CSV}"
+12
View File
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Simulate all genomes. Delegates to simulate_one.sh per genome.
# Prefer running via `gmake simulate` which handles individual dependencies.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for genome_file in "${SCRIPT_DIR}"/genomes/*.fna.gz; do
out_dir=$("${SCRIPT_DIR}/../.venv/bin/python3" "${SCRIPT_DIR}/make_deps.py" \
--dir-for "${genome_file}")
bash "${SCRIPT_DIR}/simulate_one.sh" "${genome_file}" "${out_dir}"
done
+33
View File
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
genome_file="$1"
out_dir="$2"
mkdir -p "${out_dir}"
tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna")
trap 'rm -f "${tmp_fasta}"' EXIT
gzip -dc "${genome_file}" > "${tmp_fasta}"
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"
"${ISS}" generate \
--genomes "${tmp_fasta}" \
--model HiSeq \
--n_reads "${n_reads}" \
--cpus "${CPUS}" \
--compress \
--output "${out_dir}/reads"
+181
View File
@@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""Compare an obikmer count index against a reference kmer set (presence + counts).
Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
streams `obikmer dump` from a --with-counts index, then reports:
- false negatives : kmers in reference absent from the index
- false positives : kmers in the index absent from the reference
- count mismatches: kmers present in both but with differing counts
Output to stdout: one CSV row
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
fn_pct,fp_pct,cm_pct
"""
import argparse
import subprocess
import sys
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── dump parsing ──────────────────────────────────────────────────────────────
def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
"""Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32)."""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
kmers, counts = [], []
header = True
for line in proc.stdout:
if header:
header = False
continue
parts = line.rstrip('\n').split(',')
kmers.append(encode_kmer(parts[0]))
counts.append(int(parts[1]))
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
order = np.argsort(np.array(kmers, dtype=np.uint64), kind='stable')
return (np.array(kmers, dtype=np.uint64)[order],
np.array(counts, dtype=np.uint32)[order])
# ── comparison ────────────────────────────────────────────────────────────────
def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
idx_kmers: np.ndarray, idx_counts: np.ndarray,
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
"""Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).
All arrays sorted; cm_* cover kmers present in both arrays but with
differing counts.
"""
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
# Count mismatches among shared kmers.
# Both arrays are sorted so we can use searchsorted.
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
shared_ref_counts = ref_counts[shared_mask]
shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
mismatch_mask = shared_ref_counts != shared_idx_counts
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
cm_ref_counts = shared_ref_counts[mismatch_mask]
cm_idx_counts = shared_idx_counts[mismatch_mask]
return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('reference', metavar='REF_NPZ', nargs='?',
help='Reference .npz file')
ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
help='obikmer index directory (built with --with-counts)')
ap.add_argument('--obikmer', default='obikmer',
help='Path to obikmer binary')
ap.add_argument('--species', default='')
ap.add_argument('--strain', default='')
ap.add_argument('--header', action='store_true',
help='Print CSV header and exit')
ap.add_argument('--save-fp', metavar='FILE',
help='Save false-positive kmer strings to FILE')
ap.add_argument('--save-fn', metavar='FILE',
help='Save false-negative kmer strings to FILE')
ap.add_argument('--save-cm', metavar='FILE',
help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,count_mismatch,'
'fn_pct,fp_pct,cm_pct')
return
# Detect k
cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
# Load reference
print(f'Loading reference: {args.reference}', file=sys.stderr)
npz = np.load(args.reference)
ref_kmers = npz['kmers'] # sorted uint64
ref_counts = npz['counts'] # uint32
# Load index
print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
idx_kmers, idx_counts = load_index(args.obikmer, args.index)
print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)
false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
ref_kmers, ref_counts, idx_kmers, idx_counts)
n_shared = len(ref_kmers) - len(false_neg)
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
print(f'false negatives : {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
print(f'false positives : {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
print(f'count mismatches: {len(cm_kmers):,} ({cm_pct:.4f}% of shared)',
file=sys.stderr)
if args.save_fn and len(false_neg):
with open(args.save_fn, 'w') as fh:
for v in false_neg:
fh.write(decode_kmer(int(v), k) + '\n')
if args.save_fp and len(false_pos):
with open(args.save_fp, 'w') as fh:
for v in false_pos:
fh.write(decode_kmer(int(v), k) + '\n')
if args.save_cm and len(cm_kmers):
with open(args.save_cm, 'w') as fh:
fh.write('kmer,ref_count,idx_count\n')
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
fh.write(f'{decode_kmer(int(v), k)},{rc},{ic}\n')
print(f'{args.species},{args.strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
if __name__ == '__main__':
main()
+201
View File
@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""Verify the merged count index against all per-specimen reference sets.
Streams `obikmer dump` once on the merged index, accumulates per-specimen
kmer+count pairs from each column, then compares each against its reference .npz.
Output to stdout: one CSV row per specimen (same columns as verify_count.py)
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
fn_pct,fp_pct,cm_pct
"""
import argparse
import subprocess
import sys
from pathlib import Path
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── single-pass dump ──────────────────────────────────────────────────────────
def stream_merged_dump(obikmer_bin: str, index_dir: str,
) -> tuple[list[str], dict[str, tuple[list[int], list[int]]]]:
"""Stream the merged dump once.
Returns:
specimen_names : column labels in dump order
per_specimen : mapping label → (kmer_ints, counts) for entries > 0
"""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
header_line = proc.stdout.readline().rstrip('\n')
cols = header_line.split(',')
specimen_names = cols[1:]
per_specimen: dict[str, tuple[list[int], list[int]]] = {
name: ([], []) for name in specimen_names}
for line in proc.stdout:
parts = line.rstrip('\n').split(',')
kmer_int = encode_kmer(parts[0])
for i, name in enumerate(specimen_names):
count = int(parts[i + 1])
if count > 0:
per_specimen[name][0].append(kmer_int)
per_specimen[name][1].append(count)
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
return specimen_names, per_specimen
# ── per-specimen comparison ───────────────────────────────────────────────────
def compare_specimen(name: str,
kmer_list: list[int],
count_list: list[int],
ref_dir: Path,
k: int,
save_fn: Path | None,
save_fp: Path | None,
save_cm: Path | None,
) -> str:
ref_path = ref_dir / f'{name}.npz'
if not ref_path.exists():
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
return ''
species = name.split('--')[0]
strain = name[len(species) + 2:]
npz = np.load(ref_path)
ref_kmers = npz['kmers'] # sorted uint64
ref_counts = npz['counts'] # uint32
order = np.argsort(np.array(kmer_list, dtype=np.uint64), kind='stable')
idx_kmers = np.array(kmer_list, dtype=np.uint64)[order]
idx_counts = np.array(count_list, dtype=np.uint32)[order]
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
# Count mismatches among shared kmers
pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)
pos_in_idx = np.clip(pos_in_idx, 0, len(idx_kmers) - 1)
shared_mask = idx_kmers[pos_in_idx] == ref_kmers
mismatch_mask = ref_counts[shared_mask] != idx_counts[pos_in_idx[shared_mask]]
cm_kmers = ref_kmers[shared_mask][mismatch_mask]
cm_ref = ref_counts[shared_mask][mismatch_mask]
cm_idx = idx_counts[pos_in_idx[shared_mask]][mismatch_mask]
n_shared = int(shared_mask.sum())
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
cm_pct = 100.0 * len(cm_kmers) / n_shared if n_shared else 0.0
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
f'fp={len(false_pos):,} ({fp_pct:.4f}%) '
f'cm={len(cm_kmers):,} ({cm_pct:.4f}%)',
file=sys.stderr)
if save_fn and len(false_neg):
fn_file = save_fn / f'{name}_fn.txt'
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
if save_fp and len(false_pos):
fp_file = save_fp / f'{name}_fp.txt'
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
if save_cm and len(cm_kmers):
cm_file = save_cm / f'{name}_cm.csv'
lines = ['kmer,ref_count,idx_count']
for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx):
lines.append(f'{decode_kmer(int(v), k)},{rc},{ic}')
cm_file.write_text('\n'.join(lines) + '\n')
return (f'{species},{strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},{len(cm_kmers)},'
f'{fn_pct:.4f},{fp_pct:.4f},{cm_pct:.4f}')
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
help='Merged count index directory')
ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
help='Directory containing per-specimen .npz reference files')
ap.add_argument('--obikmer', default='obikmer')
ap.add_argument('--header', action='store_true',
help='Print CSV header and exit')
ap.add_argument('--save-fn', metavar='DIR',
help='Directory for false-negative kmer lists')
ap.add_argument('--save-fp', metavar='DIR',
help='Directory for false-positive kmer lists')
ap.add_argument('--save-cm', metavar='DIR',
help='Directory for count-mismatch CSV files')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,count_mismatch,'
'fn_pct,fp_pct,cm_pct')
return
ref_dir = Path(args.ref_dir)
save_fn = Path(args.save_fn) if args.save_fn else None
save_fp = Path(args.save_fp) if args.save_fp else None
save_cm = Path(args.save_cm) if args.save_cm else None
for d in (save_fn, save_fp, save_cm):
if d: d.mkdir(parents=True, exist_ok=True)
out1 = subprocess.check_output(
[args.obikmer, 'dump', '--head', '1', args.index],
stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
for name in specimen_names:
kmers, counts = per_specimen[name]
row = compare_specimen(name, kmers, counts, ref_dir, k,
save_fn, save_fp, save_cm)
if row:
print(row)
if __name__ == '__main__':
main()
+27
View File
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_count"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_count"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_count.py"
mkdir -p "${STATS_DIR}"
CURRENT="${STATS_DIR}/current.csv"
"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
"${INDEX}" "${REF_DIR}" \
>>"${CURRENT}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'count_*.csv' | wc -l | tr -d ' ')")
ARCHIVE="${STATS_DIR}/count_${run_n}.csv"
cp "${CURRENT}" "${ARCHIVE}"
echo "Done. Results → ${ARCHIVE}"
+170
View File
@@ -0,0 +1,170 @@
#!/usr/bin/env python3
"""Verify the merged presence index against all per-specimen reference sets.
Streams `obikmer dump` once on the merged index, accumulates per-specimen
kmer sets from each column, then compares each against its reference .npz.
Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
"""
import argparse
import subprocess
import sys
from pathlib import Path
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── single-pass dump ──────────────────────────────────────────────────────────
def stream_merged_dump(obikmer_bin: str, index_dir: str,
) -> tuple[list[str], dict[str, list[int]]]:
"""Stream the merged dump once.
Returns:
specimen_names : column labels in dump order (excluding 'kmer')
per_specimen : mapping label → list of kmer ints where presence > 0
"""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
header_line = proc.stdout.readline().rstrip('\n')
cols = header_line.split(',')
specimen_names = cols[1:] # first col is 'kmer'
per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
for line in proc.stdout:
parts = line.rstrip('\n').split(',')
kmer_int = encode_kmer(parts[0])
for i, name in enumerate(specimen_names):
if int(parts[i + 1]) > 0:
per_specimen[name].append(kmer_int)
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
return specimen_names, per_specimen
# ── per-specimen comparison ───────────────────────────────────────────────────
def compare_specimen(name: str,
kmer_list: list[int],
ref_dir: Path,
k: int,
save_fn: Path | None,
save_fp: Path | None,
) -> str:
"""Compare one specimen column against its reference .npz.
Returns a CSV row string.
"""
ref_path = ref_dir / f'{name}.npz'
if not ref_path.exists():
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
return ''
species = name.split('--')[0]
strain = name[len(species) + 2:]
ref_kmers = np.load(ref_path)['kmers'] # sorted uint64
idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
file=sys.stderr)
if save_fn and len(false_neg):
fn_file = save_fn / f'{name}_fn.txt'
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
if save_fp and len(false_pos):
fp_file = save_fp / f'{name}_fp.txt'
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
return (f'{species},{strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},'
f'{fn_pct:.4f},{fp_pct:.4f}')
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
help='Merged presence index directory')
ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
help='Directory containing per-specimen .npz reference files')
ap.add_argument('--obikmer', default='obikmer')
ap.add_argument('--header', action='store_true',
help='Print CSV header and exit')
ap.add_argument('--save-fn', metavar='DIR',
help='Directory to save false-negative kmer lists')
ap.add_argument('--save-fp', metavar='DIR',
help='Directory to save false-positive kmer lists')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,fn_pct,fp_pct')
return
ref_dir = Path(args.ref_dir)
save_fn = Path(args.save_fn) if args.save_fn else None
save_fp = Path(args.save_fp) if args.save_fp else None
if save_fn: save_fn.mkdir(parents=True, exist_ok=True)
if save_fp: save_fp.mkdir(parents=True, exist_ok=True)
# Detect k
out1 = subprocess.check_output(
[args.obikmer, 'dump', '--head', '1', args.index],
stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
for name in specimen_names:
row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
if row:
print(row)
if __name__ == '__main__':
main()
+27
View File
@@ -0,0 +1,27 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
INDEX="${SCRIPT_DIR}/global_index_presence"
REF_DIR="${SCRIPT_DIR}/reference_index"
STATS_DIR="${SCRIPT_DIR}/stats/verify_merge_presence"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_merge_presence.py"
mkdir -p "${STATS_DIR}"
CURRENT="${STATS_DIR}/current.csv"
"${PYTHON}" "${VERIFY_PY}" --header >"${CURRENT}"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
"${INDEX}" "${REF_DIR}" \
>>"${CURRENT}"
run_n=$(printf '%03d' "$(find "${STATS_DIR}" -maxdepth 1 -name 'presence_*.csv' | wc -l | tr -d ' ')")
ARCHIVE="${STATS_DIR}/presence_${run_n}.csv"
cp "${CURRENT}" "${ARCHIVE}"
echo "Done. Results → ${ARCHIVE}"
+30
View File
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Usage: verify_one_count.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Output: stats/verify_count/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_count.py"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIMEN}] verifying count"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
--species "${species}" \
--strain "${strain}" \
"${REF_NPZ}" "${INDEX_DIR}" \
>"${STATS_FILE}"
+30
View File
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Usage: verify_one_presence.sh SPECIMEN
# SPECIMEN = "species--strain" (Make pattern stem)
# Output: stats/verify_presence/SPECIMEN.stats (one CSV data row, no header)
set -euo pipefail
SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
VERIFY_PY="${SCRIPT_DIR}/verify_presence.py"
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"
REF_NPZ="${SCRIPT_DIR}/reference_index/${SPECIMEN}.npz"
INDEX_DIR="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/verify_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
mkdir -p "${STATS_DIR}"
echo "[${SPECIMEN}] verifying presence"
"${PYTHON}" "${VERIFY_PY}" \
--obikmer "${BINARY}" \
--species "${species}" \
--strain "${strain}" \
"${REF_NPZ}" "${INDEX_DIR}" \
>"${STATS_FILE}"
+139
View File
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""Compare an obikmer index against a reference kmer set (presence/absence).
Loads the reference .npz (sorted uint64 kmers built by build_reference.py),
streams the output of `obikmer dump`, encodes each kmer string to uint64,
then reports false negatives and false positives using numpy set operations.
Output to stdout: one CSV row
species, strain, ref_kmers, idx_kmers, false_neg, false_pos, fn_pct, fp_pct
"""
import argparse
import subprocess
import sys
import numpy as np
# ── encoding ──────────────────────────────────────────────────────────────────
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
'a': 0, 'c': 1, 'g': 2, 't': 3}
_DECODE = ['A', 'C', 'G', 'T']
def encode_kmer(s: str) -> int:
kmer = 0
for c in s:
kmer = (kmer << 2) | _ENCODE[c]
return kmer
def decode_kmer(val: int, k: int) -> str:
bases = []
for _ in range(k):
bases.append(_DECODE[val & 3])
val >>= 2
return ''.join(reversed(bases))
# ── dump parsing ──────────────────────────────────────────────────────────────
def load_index_kmers(obikmer_bin: str, index_dir: str) -> np.ndarray:
"""Stream `obikmer dump` and return a sorted uint64 array of kmer integers."""
cmd = [obikmer_bin, 'dump', index_dir]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
text=True)
kmers = []
header = True
for line in proc.stdout:
if header:
header = False
continue
kmer_str = line.split(',', 1)[0]
kmers.append(encode_kmer(kmer_str))
proc.wait()
if proc.returncode != 0:
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
sys.exit(1)
arr = np.array(kmers, dtype=np.uint64)
arr.sort()
return arr
# ── comparison ────────────────────────────────────────────────────────────────
def compare(ref: np.ndarray, idx: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
"""Return (false_negatives, false_positives) as uint64 arrays."""
false_neg = np.setdiff1d(ref, idx, assume_unique=True)
false_pos = np.setdiff1d(idx, ref, assume_unique=True)
return false_neg, false_pos
# ── main ─────────────────────────────────────────────────────────────────────
def main() -> None:
ap = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
ap.add_argument('reference', metavar='REF_NPZ', nargs='?', help='Reference .npz file')
ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='obikmer index directory')
ap.add_argument('--obikmer', default='obikmer', help='Path to obikmer binary')
ap.add_argument('--species', default='', help='Species label for CSV row')
ap.add_argument('--strain', default='', help='Strain label for CSV row')
ap.add_argument('--header', action='store_true', help='Print CSV header and exit')
ap.add_argument('--save-fp', metavar='FILE',
help='Save false-positive kmer strings to FILE')
ap.add_argument('--save-fn', metavar='FILE',
help='Save false-negative kmer strings to FILE')
args = ap.parse_args()
if args.header:
print('species,strain,ref_kmers,idx_kmers,'
'false_neg,false_pos,fn_pct,fp_pct')
return
# Detect k from the index (one cheap call before the full dump).
cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
k = len(out1.splitlines()[1].split(',')[0])
# Load reference
print(f'Loading reference: {args.reference}', file=sys.stderr)
npz = np.load(args.reference)
ref_kmers = npz['kmers'] # already sorted uint64
# Load index
print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
idx_kmers = load_index_kmers(args.obikmer, args.index)
print(f'k={k} ref={len(ref_kmers):,} idx={len(idx_kmers):,}', file=sys.stderr)
false_neg, false_pos = compare(ref_kmers, idx_kmers)
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
print(f'false negatives: {len(false_neg):,} ({fn_pct:.4f}%)', file=sys.stderr)
print(f'false positives: {len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr)
if args.save_fn and len(false_neg):
with open(args.save_fn, 'w') as fh:
for v in false_neg:
fh.write(decode_kmer(int(v), k) + '\n')
print(f'False negatives saved → {args.save_fn}', file=sys.stderr)
if args.save_fp and len(false_pos):
with open(args.save_fp, 'w') as fh:
for v in false_pos:
fh.write(decode_kmer(int(v), k) + '\n')
print(f'False positives saved → {args.save_fp}', file=sys.stderr)
print(f'{args.species},{args.strain},'
f'{len(ref_kmers)},{len(idx_kmers)},'
f'{len(false_neg)},{len(false_pos)},'
f'{fn_pct:.4f},{fp_pct:.4f}')
if __name__ == '__main__':
main()