Add genomic distance benchmarking suite and test data
Introduces scripts to compute and validate pairwise genomic distance matrices across multiple metrics. Updates the Makefile with build and comparison targets, adds .gitignore rules for generated outputs, and includes test CSV matrices and a Newick phylogenetic tree for validating the distance computation pipeline.
This commit is contained in:
+88
-2
@@ -10,6 +10,23 @@ GENOMES := $(wildcard genomes/*.fna.gz)
|
||||
-include deps.mk
|
||||
|
||||
# One reference index file (.npz) under reference_index/ for every name
# listed in SPECIMENS.
REF_NPZS := $(patsubst %,reference_index/%.npz,$(SPECIMENS))

# Reference tables, all located in reference_dist/. Each stem listed below
# becomes reference_dist/<stem>.csv; the order is significant only for
# readability.
REF_DIST_CSVS := $(patsubst %,reference_dist/%.csv, \
    shared_kmers hamming_dist jaccard_dist \
    bray_curtis_dist relfreq_bray_curtis_dist \
    euclidean_dist relfreq_euclidean_dist \
    hellinger_dist hellinger_euclidean_dist)
|
||||
# Files expected in obikmer_dist/presence/: for jaccard a distance CSV, a
# shared table and a Newick tree; for hamming a distance CSV and a Newick tree.
OBIKMER_PRESENCE_DIST := $(patsubst %,obikmer_dist/presence/%, \
    jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
    hamming_dist.csv hamming_nj.nwk)
|
||||
# Files expected in obikmer_dist/count/. jaccard contributes three files
# (distance CSV, shared table, Newick tree); every other metric contributes a
# <metric>_dist.csv / <metric>_nj.nwk pair, generated here in a fixed order.
OBIKMER_COUNT_DIST := $(addprefix obikmer_dist/count/, \
    jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
    $(foreach metric,bray_curtis relfreq_bray_curtis euclidean relfreq_euclidean hellinger hellinger_euclidean, \
        $(metric)_dist.csv $(metric)_nj.nwk))
|
||||
# Path of the distance-comparison summary table.
DIST_COMPARISON := stats/dist_comparison/summary.csv

# Per-specimen paths derived from SPECIMENS: an index.done marker inside each
# specimen's index directory, and (presence only, here) a .stats file.
PRESENCE_DONE  := $(patsubst %,specimen_index_presence/%/index.done,$(SPECIMENS))
PRESENCE_STATS := $(patsubst %,stats/indexing_presence/%.stats,$(SPECIMENS))
COUNT_DONE     := $(patsubst %,specimen_index_count/%/index.done,$(SPECIMENS))
|
||||
@@ -24,7 +41,9 @@ SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/read
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
.PHONY: all simulate reference \
|
||||
.PHONY: all simulate reference reference_dist \
|
||||
obikmer_dist obikmer_dist_presence obikmer_dist_count \
|
||||
dist_comparison \
|
||||
index_presence index_count \
|
||||
aggregate_index_presence aggregate_index_count \
|
||||
merge_presence merge_count \
|
||||
@@ -39,7 +58,8 @@ verify_merge_count: stats/verify_merge_count/current.csv
|
||||
|
||||
# Top-level goal: the verification aggregates, both merge verifications, the
# filter aggregates and the distance comparison. Written as a single
# continued prerequisite list; the superseded final line (the one without
# dist_comparison) is dropped so the list is not broken in the middle.
all: aggregate_verify_presence aggregate_verify_count \
     verify_merge_presence verify_merge_count \
     aggregate_filter_presence aggregate_filter_count \
     dist_comparison
|
||||
|
||||
# ── dependency file ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -62,6 +82,72 @@ reference_index/%.npz:
|
||||
|
||||
reference: $(REF_NPZS)
|
||||
|
||||
# ── reference distance matrices ───────────────────────────────────────────────
|
||||
|
||||
# Convenience name for "all reference distance CSVs".
reference_dist: $(REF_DIST_CSVS)

# Grouped target (&:): build_reference_dist.py is run once and is expected to
# leave every file of REF_DIST_CSVS behind. The script itself is listed as a
# prerequisite so that editing it triggers a rebuild.
$(REF_DIST_CSVS) &: $(REF_NPZS) build_reference_dist.py
	$(VENV_PY) build_reference_dist.py
|
||||
|
||||
# ── obikmer distance (presence index) ────────────────────────────────────────
|
||||
|
||||
# Convenience name for every presence-index distance output.
obikmer_dist_presence: $(OBIKMER_PRESENCE_DIST)

# Grouped target (&:): a single recipe run is expected to produce all files
# of OBIKMER_PRESENCE_DIST. It is redone when the global presence index
# marker or the binary changes. Each invocation passes an output prefix
# (--output) and one metric; --nj is given to both, --shared-kmers only to
# the jaccard run.
$(OBIKMER_PRESENCE_DIST) &: global_index_presence/index.done $(BINARY)
	mkdir -p obikmer_dist/presence
	$(BINARY) distance --output obikmer_dist/presence/jaccard \
	    --metric jaccard --shared-kmers --nj global_index_presence
	$(BINARY) distance --output obikmer_dist/presence/hamming \
	    --metric hamming --nj global_index_presence
|
||||
|
||||
# ── obikmer distance (count index) ───────────────────────────────────────────
|
||||
|
||||
# Grouped target (&:): a single recipe run is expected to produce all files
# of OBIKMER_COUNT_DIST. It is redone when the global count index marker or
# the binary changes. One invocation per metric, each with its own output
# prefix under obikmer_dist/count/; note that metric names use hyphens on the
# command line while the output prefixes use underscores.
$(OBIKMER_COUNT_DIST) &: global_index_count/index.done $(BINARY)
	mkdir -p obikmer_dist/count
	$(BINARY) distance --output obikmer_dist/count/jaccard \
	    --metric jaccard --shared-kmers --nj global_index_count
	$(BINARY) distance --output obikmer_dist/count/bray_curtis \
	    --metric bray-curtis --nj global_index_count
	$(BINARY) distance --output obikmer_dist/count/relfreq_bray_curtis \
	    --metric relfreq-bray-curtis --nj global_index_count
	$(BINARY) distance --output obikmer_dist/count/euclidean \
	    --metric euclidean --nj global_index_count
	$(BINARY) distance --output obikmer_dist/count/relfreq_euclidean \
	    --metric relfreq-euclidean --nj global_index_count
	$(BINARY) distance --output obikmer_dist/count/hellinger \
	    --metric hellinger --nj global_index_count
	$(BINARY) distance --output obikmer_dist/count/hellinger_euclidean \
	    --metric hellinger-euclidean --nj global_index_count

# Convenience name for every count-index distance output.
obikmer_dist_count: $(OBIKMER_COUNT_DIST)

# Umbrella goal: both the presence-index and the count-index distance outputs.
obikmer_dist: obikmer_dist_presence obikmer_dist_count
|
||||
|
||||
# ── distance comparison ───────────────────────────────────────────────────────
|
||||
|
||||
# Convenience name for the comparison summary file.
dist_comparison: $(DIST_COMPARISON)

# The summary is rebuilt whenever any reference CSV, any obikmer distance
# output or the comparison script changes. The target path is handed to the
# script through --out; the inputs are not passed on the command line.
$(DIST_COMPARISON): $(REF_DIST_CSVS) $(OBIKMER_PRESENCE_DIST) \
                    $(OBIKMER_COUNT_DIST) compare_all_dist.py
	$(VENV_PY) compare_all_dist.py --out $@
|
||||
|
||||
# ── per-specimen indexing ─────────────────────────────────────────────────────
|
||||
# Prerequisites (reads → index.done + .stats) are in deps.mk.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user