Files
obikmer/benchmark/Makefile
T
Eric Coissac 469e53b6f5 Add genomic distance benchmarking suite and test data
Introduces scripts to compute and validate pairwise genomic distance matrices across multiple metrics. Updates the Makefile with build and comparison targets, adds .gitignore rules for generated outputs, and includes test CSV matrices and a Newick phylogenetic tree for validating the distance computation pipeline.
2026-06-22 18:24:30 +02:00

231 lines
9.6 KiB
Makefile

# Grouped targets (&:) are used below, so GNU Make 4.3 or newer is required;
# on macOS run this file with gmake.

# Paths to the obikmer executable and to the Python interpreter of the venv.
BINARY  := ../src/target/release/obikmer
VENV_PY := ../.venv/bin/python3

# Every gzipped genome FASTA found under genomes/.
GENOMES := $(wildcard genomes/*.fna.gz)

# deps.mk is written by make_deps.py from the genome FASTA headers, much like
# the .d files of a C build.  It provides SPECIMENS, SPECIES and the complete
# dependency graph.  Make regenerates it whenever genomes/ changes and then
# restarts.  The include must stay ahead of the lists below: they are expanded
# immediately (:=) and read SPECIMENS / SPECIES.
-include deps.mk

# One reference k-mer archive per specimen.
REF_NPZS := $(patsubst %,reference_index/%.npz,$(SPECIMENS))

# Reference matrices, all written under reference_dist/.
REF_DIST_CSVS := $(patsubst %,reference_dist/%.csv, \
    shared_kmers hamming_dist jaccard_dist \
    bray_curtis_dist relfreq_bray_curtis_dist \
    euclidean_dist relfreq_euclidean_dist \
    hellinger_dist hellinger_euclidean_dist)
# Files declared as outputs of `obikmer distance` on the presence index.
OBIKMER_PRESENCE_DIST := $(patsubst %,obikmer_dist/presence/%, \
    jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
    hamming_dist.csv hamming_nj.nwk)

# Files declared as outputs of `obikmer distance` on the count index:
# a distance matrix and an NJ tree per metric, plus the jaccard shared matrix.
OBIKMER_COUNT_DIST := $(patsubst %,obikmer_dist/count/%, \
    jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
    bray_curtis_dist.csv bray_curtis_nj.nwk \
    relfreq_bray_curtis_dist.csv relfreq_bray_curtis_nj.nwk \
    euclidean_dist.csv euclidean_nj.nwk \
    relfreq_euclidean_dist.csv relfreq_euclidean_nj.nwk \
    hellinger_dist.csv hellinger_nj.nwk \
    hellinger_euclidean_dist.csv hellinger_euclidean_nj.nwk)

# Summary table written by compare_all_dist.py.
DIST_COMPARISON := stats/dist_comparison/summary.csv
# Per-specimen targets (one word per entry of SPECIMENS).
PRESENCE_DONE  := $(addsuffix /index.done,$(addprefix specimen_index_presence/,$(SPECIMENS)))
PRESENCE_STATS := $(addsuffix .stats,$(addprefix stats/indexing_presence/,$(SPECIMENS)))
COUNT_DONE     := $(addsuffix /index.done,$(addprefix specimen_index_count/,$(SPECIMENS)))
COUNT_STATS    := $(addsuffix .stats,$(addprefix stats/indexing_count/,$(SPECIMENS)))

VERIFY_PRESENCE_STATS := $(addsuffix .stats,$(addprefix stats/verify_presence/,$(SPECIMENS)))
VERIFY_COUNT_STATS    := $(addsuffix .stats,$(addprefix stats/verify_count/,$(SPECIMENS)))

# Per-species targets (one word per entry of SPECIES).
SPECIFIC_PRESENCE_DONE  := $(addsuffix /index.done,$(addprefix specific_index_presence/,$(SPECIES)))
SPECIFIC_PRESENCE_STATS := $(addsuffix .stats,$(addprefix stats/specific_kmer_presence/,$(SPECIES)))
SPECIFIC_COUNT_DONE     := $(addsuffix /index.done,$(addprefix specific_index_count/,$(SPECIES)))
SPECIFIC_COUNT_STATS    := $(addsuffix .stats,$(addprefix stats/specific_kmer_count/,$(SPECIES)))

# A specimen name of the form a--b maps to the directory simulated_data/a/b/:
# every "--" becomes a path separator before the read file name is appended.
SIMULATED_READS := $(patsubst %,simulated_data/%/reads_R1.fastq.gz,$(subst --,/,$(SPECIMENS)))
# Run recipes one at a time, even when make is invoked with -j.
.NOTPARALLEL:

# A bare `make` must build `all`.  Without this setting the default goal is
# the first target Make reads: a rule coming from the included deps.mk if it
# defines any, otherwise the verify_merge_presence alias below.  Both appear
# before `all`.
.DEFAULT_GOAL := all

# Command-style targets: none of these names is a file.
.PHONY: all simulate reference reference_dist \
        obikmer_dist obikmer_dist_presence obikmer_dist_count \
        dist_comparison \
        index_presence index_count \
        aggregate_index_presence aggregate_index_count \
        merge_presence merge_count \
        verify_presence verify_count \
        aggregate_verify_presence aggregate_verify_count \
        verify_merge_presence verify_merge_count \
        filter_presence filter_count \
        aggregate_filter_presence aggregate_filter_count

# Short names for the merged-index verification reports.
verify_merge_presence: stats/verify_merge_presence/current.csv
verify_merge_count:    stats/verify_merge_count/current.csv

# Top-level goal: aggregated per-specimen verification, merged-index
# verification, aggregated species-specific filtering, and the distance
# comparison summary.
all: aggregate_verify_presence aggregate_verify_count \
     verify_merge_presence verify_merge_count \
     aggregate_filter_presence aggregate_filter_count \
     dist_comparison
# ── dependency file ───────────────────────────────────────────────────────────
# Regenerate deps.mk from the genome files.
#  * make_deps.py is listed as a prerequisite so that editing the generator
#    refreshes the dependency graph; it is filtered out of the argument list,
#    so the script still receives only the genome paths.
#  * The output is written to a temporary file and renamed on success.  A
#    failed run therefore cannot leave a truncated deps.mk that is newer than
#    the genomes and would be considered up to date.
deps.mk: $(GENOMES) make_deps.py
	$(VENV_PY) make_deps.py $(filter-out make_deps.py,$^) > $@.tmp \
	  && mv -f $@.tmp $@ \
	  || { rm -f $@.tmp; exit 1; }
# ── simulation ────────────────────────────────────────────────────────────────
simulate: $(SIMULATED_READS)

# The genome → reads prerequisites live in deps.mk, which makes $< the genome
# file.  The second argument is the directory of the read file, passed with
# its trailing slash.
$(SIMULATED_READS):
	bash simulate_one.sh $< $(@D)/
# ── reference kmer sets ───────────────────────────────────────────────────────
reference: $(REF_NPZS)

# The reads → npz prerequisites are declared in deps.mk; this pattern rule only
# contributes the recipe.  The stem ($*) is the specimen name handed to the
# script.
reference_index/%.npz:
	bash build_reference.sh $*
# ── reference distance matrices ───────────────────────────────────────────────
reference_dist: $(REF_DIST_CSVS)

# Grouped target: a single run of build_reference_dist.py is declared to
# produce every reference matrix.  It reruns when any reference archive or the
# script itself changes.
$(REF_DIST_CSVS) &: $(REF_NPZS) build_reference_dist.py
	$(VENV_PY) build_reference_dist.py
# ── obikmer distance (presence index) ────────────────────────────────────────
obikmer_dist_presence: $(OBIKMER_PRESENCE_DIST)

# Grouped target: both metrics are computed in one recipe from the merged
# presence index.  Every target sits in obikmer_dist/presence, so $(@D) is that
# directory whichever member of the group triggered the recipe.
$(OBIKMER_PRESENCE_DIST) &: global_index_presence/index.done $(BINARY)
	mkdir -p $(@D)
	$(BINARY) distance --output $(@D)/jaccard --metric jaccard --shared-kmers --nj global_index_presence
	$(BINARY) distance --output $(@D)/hamming --metric hamming --nj global_index_presence
# ── obikmer distance (count index) ───────────────────────────────────────────
obikmer_dist: obikmer_dist_presence obikmer_dist_count

obikmer_dist_count: $(OBIKMER_COUNT_DIST)

# Grouped target: one recipe runs `obikmer distance` once per metric on the
# merged count index.  Every target sits in obikmer_dist/count, so $(@D) is
# that directory whichever member of the group triggered the recipe.  Output
# prefixes use underscores while the --metric names use hyphens.
$(OBIKMER_COUNT_DIST) &: global_index_count/index.done $(BINARY)
	mkdir -p $(@D)
	$(BINARY) distance --output $(@D)/jaccard --metric jaccard --shared-kmers --nj global_index_count
	$(BINARY) distance --output $(@D)/bray_curtis --metric bray-curtis --nj global_index_count
	$(BINARY) distance --output $(@D)/relfreq_bray_curtis --metric relfreq-bray-curtis --nj global_index_count
	$(BINARY) distance --output $(@D)/euclidean --metric euclidean --nj global_index_count
	$(BINARY) distance --output $(@D)/relfreq_euclidean --metric relfreq-euclidean --nj global_index_count
	$(BINARY) distance --output $(@D)/hellinger --metric hellinger --nj global_index_count
	$(BINARY) distance --output $(@D)/hellinger_euclidean --metric hellinger-euclidean --nj global_index_count
# ── distance comparison ───────────────────────────────────────────────────────
dist_comparison: $(DIST_COMPARISON)

# The summary is rebuilt when any reference or obikmer distance output changes,
# or when the comparison script itself is edited.  $@ is the summary path.
$(DIST_COMPARISON): $(REF_DIST_CSVS) $(OBIKMER_PRESENCE_DIST) $(OBIKMER_COUNT_DIST) compare_all_dist.py
	$(VENV_PY) compare_all_dist.py --out $@
# ── per-specimen indexing ─────────────────────────────────────────────────────
index_presence: $(PRESENCE_DONE)
index_count:    $(COUNT_DONE)

# The reads → index.done + .stats prerequisites come from deps.mk.  Each
# pattern rule declares both files as the product of one script run, and the
# stem ($*) is the specimen name passed to the script.
specimen_index_presence/%/index.done stats/indexing_presence/%.stats &: $(BINARY)
	bash index_one_presence.sh $*

specimen_index_count/%/index.done stats/indexing_count/%.stats &: $(BINARY)
	bash index_one_count.sh $*
# ── indexing stats aggregation ────────────────────────────────────────────────
# Phony targets: the aggregation script runs on every invocation, once all
# per-specimen .stats files are up to date.  Its argument is the stats
# category name.
aggregate_index_presence: $(PRESENCE_STATS)
	bash aggregate_stats.sh indexing_presence

aggregate_index_count: $(COUNT_STATS)
	bash aggregate_stats.sh indexing_count
# ── global merge ──────────────────────────────────────────────────────────────
merge_presence: global_index_presence/index.done
merge_count:    global_index_count/index.done

# Each merged index is rebuilt when any per-specimen index.done file or the
# obikmer binary is newer than the merged index.done.
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
	bash merge_presence.sh

global_index_count/index.done: $(COUNT_DONE) $(BINARY)
	bash merge_count.sh
# ── per-specimen verification ─────────────────────────────────────────────────
verify_presence: $(VERIFY_PRESENCE_STATS)
verify_count:    $(VERIFY_COUNT_STATS)

# The index.done + npz → .stats prerequisites are listed in deps.mk; these
# pattern rules only provide the recipes.  The stem ($*) is the specimen name.
stats/verify_presence/%.stats:
	bash verify_one_presence.sh $*

stats/verify_count/%.stats:
	bash verify_one_count.sh $*
# ── verification stats aggregation ───────────────────────────────────────────
# Phony targets: the aggregation script runs on every invocation, after all
# per-specimen verification .stats files are up to date.
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
	bash aggregate_stats.sh verify_presence

aggregate_verify_count: $(VERIFY_COUNT_STATS)
	bash aggregate_stats.sh verify_count
# ── species-specific indexes ──────────────────────────────────────────────────
filter_presence: $(SPECIFIC_PRESENCE_DONE)
filter_count:    $(SPECIFIC_COUNT_DONE)

# The global index → specific index prerequisites are in deps.mk.  Each pattern
# rule declares index.done and the .stats file as the product of one script
# run; the stem ($*) is the species name passed to the script.
specific_index_presence/%/index.done stats/specific_kmer_presence/%.stats &: $(BINARY)
	bash filter_one_presence.sh $*

specific_index_count/%/index.done stats/specific_kmer_count/%.stats &: $(BINARY)
	bash filter_one_count.sh $*

# Phony aggregation targets: run on every invocation once the per-species
# .stats files are up to date.
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
	bash aggregate_stats.sh specific_kmer_presence

aggregate_filter_count: $(SPECIFIC_COUNT_STATS)
	bash aggregate_stats.sh specific_kmer_count
# ── merged index verification ─────────────────────────────────────────────────
# Each report depends on every reference archive and on the matching merged
# index.  The scripts take no arguments.
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
	bash verify_merge_presence.sh

stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done
	bash verify_merge_count.sh