2026-06-19 09:55:41 +02:00
|
|
|
# Requires GNU Make >= 4.3 (grouped targets &:) — use gmake on macOS
|
|
|
|
|
# obikmer executable used by the indexing and distance rules
# (path is relative to this directory).
BINARY  := ../src/target/release/obikmer

# Python interpreter used to run the helper *.py scripts.
VENV_PY := ../.venv/bin/python3

# Genome archives found in genomes/ at the time the Makefile is parsed.
GENOMES := $(wildcard genomes/*.fna.gz)
|
|
|
|
|
|
|
|
|
|
# SPECIMENS, SPECIES, and the full dependency graph are generated by
# make_deps.py from the genome FASTA headers — like .d files in C.
# Make rebuilds deps.mk whenever it is missing or older than a genome file,
# then restarts so the regenerated definitions take effect.
|
|
|
|
|
-include deps.mk
|
|
|
|
|
|
|
|
|
|
# One reference .npz per specimen named in SPECIMENS (defined in deps.mk).
REF_NPZS := $(patsubst %,reference_index/%.npz,$(SPECIMENS))

# Targets of the grouped rule that runs build_reference_dist.py.
REF_DIST_CSVS := $(addprefix reference_dist/, \
    shared_kmers.csv hamming_dist.csv jaccard_dist.csv \
    $(foreach metric, \
        bray_curtis relfreq_bray_curtis euclidean relfreq_euclidean hellinger hellinger_euclidean, \
        $(metric)_dist.csv))

# Targets of the `obikmer distance` rule for the presence index.
OBIKMER_PRESENCE_DIST := $(addprefix obikmer_dist/presence/, \
    jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
    $(foreach metric,hamming,$(metric)_dist.csv $(metric)_nj.nwk))

# Targets of the `obikmer distance` rule for the count index:
# the jaccard trio, then a distance matrix and an NJ tree per remaining metric.
OBIKMER_COUNT_DIST := $(addprefix obikmer_dist/count/, \
    jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
    $(foreach metric, \
        bray_curtis relfreq_bray_curtis euclidean relfreq_euclidean hellinger hellinger_euclidean, \
        $(metric)_dist.csv $(metric)_nj.nwk))

# File passed to compare_all_dist.py via --out.
DIST_COMPARISON := stats/dist_comparison/summary.csv
|
2026-06-19 09:55:41 +02:00
|
|
|
# Per-specimen targets: an index.done marker and a .stats file for each
# index flavour (presence / count), plus one verification .stats file each.
PRESENCE_DONE  := $(patsubst %,specimen_index_presence/%/index.done,$(SPECIMENS))
PRESENCE_STATS := $(patsubst %,stats/indexing_presence/%.stats,$(SPECIMENS))
COUNT_DONE     := $(patsubst %,specimen_index_count/%/index.done,$(SPECIMENS))
COUNT_STATS    := $(patsubst %,stats/indexing_count/%.stats,$(SPECIMENS))

VERIFY_PRESENCE_STATS := $(patsubst %,stats/verify_presence/%.stats,$(SPECIMENS))
VERIFY_COUNT_STATS    := $(patsubst %,stats/verify_count/%.stats,$(SPECIMENS))

# Per-species targets of the species-specific filtering step.
SPECIFIC_PRESENCE_DONE  := $(patsubst %,specific_index_presence/%/index.done,$(SPECIES))
SPECIFIC_PRESENCE_STATS := $(patsubst %,stats/specific_kmer_presence/%.stats,$(SPECIES))
SPECIFIC_COUNT_DONE     := $(patsubst %,specific_index_count/%/index.done,$(SPECIES))
SPECIFIC_COUNT_STATS    := $(patsubst %,stats/specific_kmer_count/%.stats,$(SPECIES))

# Specimen names use "--" where the simulated_data/ tree uses a directory
# separator, so each "--" becomes "/" before the path is assembled.
SIMULATED_READS := $(patsubst %,simulated_data/%/reads_R1.fastq.gz,$(subst --,/,$(SPECIMENS)))
|
|
|
|
|
|
|
|
|
|
# Run recipes one at a time, even when make is invoked with -j.
# NOTE(review): the reason is not visible in this file — presumably the helper
# scripts parallelise internally or share scratch space; confirm before removing.
.NOTPARALLEL:

# Command-style goals (none of these names is a file), one line per stage.
.PHONY: all simulate reference reference_dist
.PHONY: obikmer_dist obikmer_dist_presence obikmer_dist_count dist_comparison
.PHONY: index_presence index_count
.PHONY: aggregate_index_presence aggregate_index_count
.PHONY: merge_presence merge_count
.PHONY: verify_presence verify_count
.PHONY: aggregate_verify_presence aggregate_verify_count
.PHONY: verify_merge_presence verify_merge_count
.PHONY: filter_presence filter_count
.PHONY: aggregate_filter_presence aggregate_filter_count
|
|
|
|
|
|
|
|
|
|
# Make bare `make` build `all`. Without this the default goal is the first
# ordinary target make reads: verify_merge_presence below, or a target from
# deps.mk if that file (included earlier) defines rules.
.DEFAULT_GOAL := all

# Goal aliases for the merged-index verification reports.
verify_merge_presence: stats/verify_merge_presence/current.csv
verify_merge_count: stats/verify_merge_count/current.csv

# Top-level goal: aggregated verification stats, merged-index verification,
# aggregated species-specific filter stats, and the distance comparison.
all: aggregate_verify_presence aggregate_verify_count \
     verify_merge_presence verify_merge_count \
     aggregate_filter_presence aggregate_filter_count \
     dist_comparison
|
2026-06-19 09:55:41 +02:00
|
|
|
|
|
|
|
|
# ── dependency file ───────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Regenerate the included dependency file when it is missing or older than a
# genome archive or the generator script. make_deps.py receives only the
# genome paths: the script itself is filtered out of $^.
# Output goes to a temporary file that is renamed on success, so a failed run
# cannot leave a truncated deps.mk that -include would accept as up to date.
deps.mk: $(GENOMES) make_deps.py
	$(VENV_PY) make_deps.py $(filter-out make_deps.py,$^) > $@.tmp
	mv -f $@.tmp $@
|
|
|
|
|
|
|
|
|
|
# ── simulation ────────────────────────────────────────────────────────────────
|
|
|
|
|
# Prerequisites (genome → reads) are in deps.mk; $< is the genome file.
|
|
|
|
|
|
|
|
|
|
# Goal alias: produce the simulated reads of every specimen.
simulate: $(SIMULATED_READS)

# One recipe shared by all read sets. No prerequisite is listed here; deps.mk
# supplies the genome for each target. The script receives the first
# prerequisite and the target's directory (with a trailing slash).
$(SIMULATED_READS):
	bash simulate_one.sh $< $(@D)/
|
|
|
|
|
|
|
|
|
|
# ── reference kmer sets ───────────────────────────────────────────────────────
|
|
|
|
|
# Prerequisites (reads → npz) are in deps.mk.
|
|
|
|
|
|
|
|
|
|
# Goal alias: build the reference .npz of every specimen.
reference: $(REF_NPZS)

# Pattern rule; the stem (specimen name) is the only argument handed to the
# script. Prerequisites for each .npz are declared in deps.mk.
reference_index/%.npz:
	bash build_reference.sh $(*)
|
|
|
|
|
|
2026-06-22 17:28:48 +02:00
|
|
|
# ── reference distance matrices ───────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Goal alias: all reference distance matrices.
reference_dist: $(REF_DIST_CSVS)

# Grouped target (&:): a single run of the script is declared to produce every
# matrix in REF_DIST_CSVS. It reruns when any reference .npz or the script
# itself is newer than the outputs.
$(REF_DIST_CSVS) &: $(REF_NPZS) build_reference_dist.py
	$(VENV_PY) build_reference_dist.py
|
|
|
|
|
|
|
|
|
|
# ── obikmer distance (presence index) ────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Goal alias: all distance outputs computed from the presence index.
obikmer_dist_presence: $(OBIKMER_PRESENCE_DIST)

# Grouped target (&:): one recipe run is declared to produce every file in
# OBIKMER_PRESENCE_DIST. All of them live in obikmer_dist/presence, which is
# what $(@D) expands to whichever target triggered the rule.
# One `obikmer distance` call per metric; --output is a path prefix.
$(OBIKMER_PRESENCE_DIST) &: global_index_presence/index.done $(BINARY)
	mkdir -p $(@D)
	$(BINARY) distance --output $(@D)/jaccard --metric jaccard --shared-kmers --nj global_index_presence
	$(BINARY) distance --output $(@D)/hamming --metric hamming --nj global_index_presence
|
|
|
|
|
|
|
|
|
|
# ── obikmer distance (count index) ───────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Goal aliases: count-index distances, and both index flavours together.
obikmer_dist_count: $(OBIKMER_COUNT_DIST)
obikmer_dist: obikmer_dist_presence obikmer_dist_count

# Grouped target (&:): one recipe run is declared to produce every file in
# OBIKMER_COUNT_DIST. All of them live in obikmer_dist/count, which is what
# $(@D) expands to whichever target triggered the rule.
# One `obikmer distance` call per metric; only jaccard also asks for the
# shared-kmers output.
$(OBIKMER_COUNT_DIST) &: global_index_count/index.done $(BINARY)
	mkdir -p $(@D)
	$(BINARY) distance --output $(@D)/jaccard --metric jaccard --shared-kmers --nj global_index_count
	$(BINARY) distance --output $(@D)/bray_curtis --metric bray-curtis --nj global_index_count
	$(BINARY) distance --output $(@D)/relfreq_bray_curtis --metric relfreq-bray-curtis --nj global_index_count
	$(BINARY) distance --output $(@D)/euclidean --metric euclidean --nj global_index_count
	$(BINARY) distance --output $(@D)/relfreq_euclidean --metric relfreq-euclidean --nj global_index_count
	$(BINARY) distance --output $(@D)/hellinger --metric hellinger --nj global_index_count
	$(BINARY) distance --output $(@D)/hellinger_euclidean --metric hellinger-euclidean --nj global_index_count
|
|
|
|
|
|
|
|
|
|
# ── distance comparison ───────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Goal alias for the comparison summary.
dist_comparison: $(DIST_COMPARISON)

# Reruns when any reference or obikmer distance output, or the script itself,
# is newer than the summary. The target path is passed to the script as --out.
$(DIST_COMPARISON): $(REF_DIST_CSVS) \
                    $(OBIKMER_PRESENCE_DIST) \
                    $(OBIKMER_COUNT_DIST) \
                    compare_all_dist.py
	$(VENV_PY) compare_all_dist.py --out $@
|
|
|
|
|
|
2026-06-19 09:55:41 +02:00
|
|
|
# ── per-specimen indexing ─────────────────────────────────────────────────────
|
|
|
|
|
# Prerequisites (reads → index.done + .stats) are in deps.mk.
|
|
|
|
|
|
|
|
|
|
# Goal aliases: index every specimen, presence or count flavour.
index_presence: $(PRESENCE_DONE)
index_count: $(COUNT_DONE)

# Grouped pattern rules: one script run per specimen is declared to produce
# both the index.done marker and the .stats file. The stem (specimen name) is
# the script's only argument. Further prerequisites are declared in deps.mk.
specimen_index_presence/%/index.done stats/indexing_presence/%.stats &: $(BINARY)
	bash index_one_presence.sh $(*)

specimen_index_count/%/index.done stats/indexing_count/%.stats &: $(BINARY)
	bash index_one_count.sh $(*)
|
|
|
|
|
|
|
|
|
|
# ── indexing stats aggregation ────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Each aggregation goal needs the per-specimen stats of its own flavour.
aggregate_index_presence: $(PRESENCE_STATS)
aggregate_index_count: $(COUNT_STATS)

# Shared recipe: the script argument is the goal name with the
# "aggregate_index_" prefix replaced by "indexing_".
aggregate_index_presence aggregate_index_count:
	bash aggregate_stats.sh $(patsubst aggregate_index_%,indexing_%,$@)
|
|
|
|
|
|
|
|
|
|
# ── global merge ──────────────────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Goal aliases for the two global (merged) indexes.
merge_presence: global_index_presence/index.done
merge_count: global_index_count/index.done

# Each global index.done marker is rebuilt when any per-specimen marker of the
# same flavour, or the obikmer binary, is newer than it.
global_index_presence/index.done: $(PRESENCE_DONE) $(BINARY)
	bash merge_presence.sh

global_index_count/index.done: $(COUNT_DONE) $(BINARY)
	bash merge_count.sh
|
|
|
|
|
|
|
|
|
|
# ── per-specimen verification ─────────────────────────────────────────────────
|
|
|
|
|
# Prerequisites (index.done + npz → .stats) are in deps.mk.
|
|
|
|
|
|
|
|
|
|
# Goal aliases: verify every specimen, presence or count flavour.
verify_presence: $(VERIFY_PRESENCE_STATS)
verify_count: $(VERIFY_COUNT_STATS)

# Pattern rules; the stem (specimen name) is the script's only argument.
# No prerequisite is listed here: they are declared in deps.mk.
stats/verify_presence/%.stats:
	bash verify_one_presence.sh $(*)

stats/verify_count/%.stats:
	bash verify_one_count.sh $(*)
|
|
|
|
|
|
|
|
|
|
# ── verification stats aggregation ───────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Each aggregation goal needs the per-specimen verification stats of its flavour.
aggregate_verify_presence: $(VERIFY_PRESENCE_STATS)
aggregate_verify_count: $(VERIFY_COUNT_STATS)

# Shared recipe: the script argument is the goal name without its
# "aggregate_" prefix (verify_presence / verify_count).
aggregate_verify_presence aggregate_verify_count:
	bash aggregate_stats.sh $(patsubst aggregate_%,%,$@)
|
|
|
|
|
|
|
|
|
|
# ── species-specific indexes ──────────────────────────────────────────────────
|
|
|
|
|
# Prerequisites (global index → specific index) are in deps.mk.
|
|
|
|
|
|
|
|
|
|
# Goal aliases: build the species-specific index of every species.
filter_presence: $(SPECIFIC_PRESENCE_DONE)
filter_count: $(SPECIFIC_COUNT_DONE)

# Grouped pattern rules: one script run per species is declared to produce
# both the index.done marker and the .stats file. The stem (species name) is
# the script's only argument. Further prerequisites are declared in deps.mk.
specific_index_presence/%/index.done stats/specific_kmer_presence/%.stats &: $(BINARY)
	bash filter_one_presence.sh $(*)

specific_index_count/%/index.done stats/specific_kmer_count/%.stats &: $(BINARY)
	bash filter_one_count.sh $(*)
|
|
|
|
|
|
|
|
|
|
# Each aggregation goal needs the per-species filter stats of its flavour.
aggregate_filter_presence: $(SPECIFIC_PRESENCE_STATS)
aggregate_filter_count: $(SPECIFIC_COUNT_STATS)

# Shared recipe: the script argument is the goal name with the
# "aggregate_filter_" prefix replaced by "specific_kmer_".
aggregate_filter_presence aggregate_filter_count:
	bash aggregate_stats.sh $(patsubst aggregate_filter_%,specific_kmer_%,$@)
|
|
|
|
|
|
|
|
|
|
# ── merged index verification ─────────────────────────────────────────────────
|
|
|
|
|
|
|
|
|
|
# Each report is rebuilt when any reference .npz or the global index marker of
# the same flavour is newer than it.
# NOTE(review): $(BINARY) is not a prerequisite here, unlike the merge rules —
# confirm the verify_merge_*.sh scripts do not depend on the obikmer binary.
stats/verify_merge_presence/current.csv: $(REF_NPZS) global_index_presence/index.done
stats/verify_merge_count/current.csv: $(REF_NPZS) global_index_count/index.done

# Shared recipe: the script is named after the report's directory
# (verify_merge_presence.sh / verify_merge_count.sh).
stats/verify_merge_presence/current.csv stats/verify_merge_count/current.csv:
	bash $(notdir $(@D)).sh
|