Add genomic distance benchmarking suite and test data
Introduces scripts to compute and validate pairwise genomic distance matrices across multiple metrics. Updates the Makefile with build and comparison targets, adds .gitignore rules for generated outputs, and includes test CSV matrices and a Newick phylogenetic tree for validating the distance computation pipeline.
This commit is contained in:
+88
-2
@@ -10,6 +10,23 @@ GENOMES := $(wildcard genomes/*.fna.gz)
|
||||
-include deps.mk
|
||||
|
||||
REF_NPZS := $(SPECIMENS:%=reference_index/%.npz)
|
||||
REF_DIST_CSVS := $(addprefix reference_dist/, \
|
||||
shared_kmers.csv hamming_dist.csv jaccard_dist.csv \
|
||||
bray_curtis_dist.csv relfreq_bray_curtis_dist.csv \
|
||||
euclidean_dist.csv relfreq_euclidean_dist.csv \
|
||||
hellinger_dist.csv hellinger_euclidean_dist.csv)
|
||||
OBIKMER_PRESENCE_DIST := $(addprefix obikmer_dist/presence/, \
|
||||
jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
|
||||
hamming_dist.csv hamming_nj.nwk)
|
||||
OBIKMER_COUNT_DIST := $(addprefix obikmer_dist/count/, \
|
||||
jaccard_dist.csv jaccard_shared.csv jaccard_nj.nwk \
|
||||
bray_curtis_dist.csv bray_curtis_nj.nwk \
|
||||
relfreq_bray_curtis_dist.csv relfreq_bray_curtis_nj.nwk \
|
||||
euclidean_dist.csv euclidean_nj.nwk \
|
||||
relfreq_euclidean_dist.csv relfreq_euclidean_nj.nwk \
|
||||
hellinger_dist.csv hellinger_nj.nwk \
|
||||
hellinger_euclidean_dist.csv hellinger_euclidean_nj.nwk)
|
||||
DIST_COMPARISON := stats/dist_comparison/summary.csv
|
||||
PRESENCE_DONE := $(SPECIMENS:%=specimen_index_presence/%/index.done)
|
||||
PRESENCE_STATS := $(SPECIMENS:%=stats/indexing_presence/%.stats)
|
||||
COUNT_DONE := $(SPECIMENS:%=specimen_index_count/%/index.done)
|
||||
@@ -24,7 +41,9 @@ SIMULATED_READS := $(foreach s,$(SPECIMENS),simulated_data/$(subst --,/,$s)/read
|
||||
|
||||
.NOTPARALLEL:
|
||||
|
||||
.PHONY: all simulate reference \
|
||||
.PHONY: all simulate reference reference_dist \
|
||||
obikmer_dist obikmer_dist_presence obikmer_dist_count \
|
||||
dist_comparison \
|
||||
index_presence index_count \
|
||||
aggregate_index_presence aggregate_index_count \
|
||||
merge_presence merge_count \
|
||||
@@ -39,7 +58,8 @@ verify_merge_count: stats/verify_merge_count/current.csv
|
||||
|
||||
all: aggregate_verify_presence aggregate_verify_count \
|
||||
verify_merge_presence verify_merge_count \
|
||||
aggregate_filter_presence aggregate_filter_count
|
||||
aggregate_filter_presence aggregate_filter_count \
|
||||
dist_comparison
|
||||
|
||||
# ── dependency file ───────────────────────────────────────────────────────────
|
||||
|
||||
@@ -62,6 +82,72 @@ reference_index/%.npz:
|
||||
|
||||
reference: $(REF_NPZS)
|
||||
|
||||
# ── reference distance matrices ───────────────────────────────────────────────
|
||||
|
||||
$(REF_DIST_CSVS) &: $(REF_NPZS) build_reference_dist.py
|
||||
$(VENV_PY) build_reference_dist.py
|
||||
|
||||
reference_dist: $(REF_DIST_CSVS)
|
||||
|
||||
# ── obikmer distance (presence index) ────────────────────────────────────────
|
||||
|
||||
$(OBIKMER_PRESENCE_DIST) &: global_index_presence/index.done $(BINARY)
|
||||
mkdir -p obikmer_dist/presence
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/presence/jaccard \
|
||||
--metric jaccard --shared-kmers --nj \
|
||||
global_index_presence
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/presence/hamming \
|
||||
--metric hamming --nj \
|
||||
global_index_presence
|
||||
|
||||
obikmer_dist_presence: $(OBIKMER_PRESENCE_DIST)
|
||||
|
||||
# ── obikmer distance (count index) ───────────────────────────────────────────
|
||||
|
||||
$(OBIKMER_COUNT_DIST) &: global_index_count/index.done $(BINARY)
|
||||
mkdir -p obikmer_dist/count
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/count/jaccard \
|
||||
--metric jaccard --shared-kmers --nj \
|
||||
global_index_count
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/count/bray_curtis \
|
||||
--metric bray-curtis --nj \
|
||||
global_index_count
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/count/relfreq_bray_curtis \
|
||||
--metric relfreq-bray-curtis --nj \
|
||||
global_index_count
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/count/euclidean \
|
||||
--metric euclidean --nj \
|
||||
global_index_count
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/count/relfreq_euclidean \
|
||||
--metric relfreq-euclidean --nj \
|
||||
global_index_count
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/count/hellinger \
|
||||
--metric hellinger --nj \
|
||||
global_index_count
|
||||
$(BINARY) distance \
|
||||
--output obikmer_dist/count/hellinger_euclidean \
|
||||
--metric hellinger-euclidean --nj \
|
||||
global_index_count
|
||||
|
||||
obikmer_dist_count: $(OBIKMER_COUNT_DIST)
|
||||
|
||||
obikmer_dist: obikmer_dist_presence obikmer_dist_count
|
||||
|
||||
# ── distance comparison ───────────────────────────────────────────────────────
|
||||
|
||||
$(DIST_COMPARISON): $(REF_DIST_CSVS) $(OBIKMER_PRESENCE_DIST) $(OBIKMER_COUNT_DIST) compare_all_dist.py
|
||||
$(VENV_PY) compare_all_dist.py --out $(DIST_COMPARISON)
|
||||
|
||||
dist_comparison: $(DIST_COMPARISON)
|
||||
|
||||
# ── per-specimen indexing ─────────────────────────────────────────────────────
|
||||
# Prerequisites (reads → index.done + .stats) are in deps.mk.
|
||||
|
||||
|
||||
Executable
+226
@@ -0,0 +1,226 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compute reference pairwise distance matrices from per-specimen .npz kmer indexes.
|
||||
|
||||
Reads all .npz files in reference_index/ (each containing sorted uint64 `kmers`
|
||||
and uint32 `counts`), computes all distance metrics supported by `obikmer distance`,
|
||||
and writes one CSV per metric to reference_dist/.
|
||||
|
||||
Output CSV format matches `obikmer distance --output`:
|
||||
- first row: "genome", then specimen names
|
||||
- subsequent rows: specimen name, then float or int values
|
||||
|
||||
Metrics written
|
||||
jaccard_dist.csv Jaccard distance (presence/absence)
|
||||
shared_kmers.csv Shared-kmer count matrix (intersection size)
|
||||
bray_curtis_dist.csv Bray-Curtis dissimilarity (raw counts)
|
||||
relfreq_bray_curtis_dist.csv Bray-Curtis on relative frequencies
|
||||
euclidean_dist.csv Euclidean distance (raw counts)
|
||||
relfreq_euclidean_dist.csv Euclidean distance on relative frequencies
|
||||
hellinger_dist.csv Hellinger distance
|
||||
hellinger_euclidean_dist.csv Euclidean distance in Hellinger space
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ── pairwise helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def shared_indices(a_kmers: np.ndarray, b_kmers: np.ndarray):
|
||||
"""Return index arrays (idx_a, idx_b) for kmers present in both sets.
|
||||
|
||||
Both arrays must be sorted uint64. Uses searchsorted: O(|B| log |A|).
|
||||
"""
|
||||
pos = np.searchsorted(a_kmers, b_kmers)
|
||||
pos = np.clip(pos, 0, len(a_kmers) - 1)
|
||||
mask = a_kmers[pos] == b_kmers
|
||||
idx_b = np.where(mask)[0]
|
||||
idx_a = pos[idx_b]
|
||||
return idx_a, idx_b
|
||||
|
||||
|
||||
def pairwise_stats(specimens: list[dict]) -> dict[str, np.ndarray]:
|
||||
"""Compute all pairwise distance matrices at once.
|
||||
|
||||
Returns a dict metric_name → ndarray (n×n float64 or int64).
|
||||
Each specimen dict has keys: name, kmers, counts.
|
||||
"""
|
||||
n = len(specimens)
|
||||
|
||||
# Pre-compute per-specimen scalars
|
||||
kmer_counts = np.array([len(s['kmers']) for s in specimens], dtype=np.uint64)
|
||||
count_sums = np.array([s['counts'].sum() for s in specimens], dtype=np.uint64)
|
||||
|
||||
# Per-specimen sum-of-squares (for Euclidean decomposition)
|
||||
sq_sums = np.array([(s['counts'].astype(np.float64) ** 2).sum() for s in specimens])
|
||||
|
||||
# Allocate output matrices
|
||||
shared_mat = np.zeros((n, n), dtype=np.uint64)
|
||||
hamming_mat = np.zeros((n, n), dtype=np.float64)
|
||||
jaccard_mat = np.zeros((n, n), dtype=np.float64)
|
||||
bray_mat = np.zeros((n, n), dtype=np.float64)
|
||||
relfreq_bray = np.zeros((n, n), dtype=np.float64)
|
||||
euclidean_mat = np.zeros((n, n), dtype=np.float64)
|
||||
relfreq_eucl = np.zeros((n, n), dtype=np.float64)
|
||||
hellinger_mat = np.zeros((n, n), dtype=np.float64)
|
||||
hell_eucl_mat = np.zeros((n, n), dtype=np.float64)
|
||||
|
||||
for i in range(n):
|
||||
a_km = specimens[i]['kmers']
|
||||
a_ct = specimens[i]['counts'].astype(np.float64)
|
||||
sa = float(count_sums[i])
|
||||
na = int(kmer_counts[i])
|
||||
|
||||
for j in range(i + 1, n):
|
||||
b_km = specimens[j]['kmers']
|
||||
b_ct = specimens[j]['counts'].astype(np.float64)
|
||||
sb = float(count_sums[j])
|
||||
nb = int(kmer_counts[j])
|
||||
|
||||
idx_a, idx_b = shared_indices(a_km, b_km)
|
||||
inter = len(idx_a)
|
||||
|
||||
ca_sh = a_ct[idx_a]
|
||||
cb_sh = b_ct[idx_b]
|
||||
|
||||
# ── Presence metrics ──────────────────────────────────────────────
|
||||
|
||||
union = na + nb - inter
|
||||
jac = (1.0 - inter / union) if union else 0.0
|
||||
hamming = float(na + nb - 2 * inter) # |A Δ B|
|
||||
|
||||
# ── Count metrics ─────────────────────────────────────────────────
|
||||
|
||||
# Bray-Curtis: 1 - 2*Σmin(a,b) / (Σa + Σb)
|
||||
sum_min = np.minimum(ca_sh, cb_sh).sum()
|
||||
denom_bc = sa + sb
|
||||
bc = (1.0 - 2.0 * sum_min / denom_bc) if denom_bc else 0.0
|
||||
|
||||
# RelfreqBray: 1 - Σmin(a/sa, b/sb) [only shared contribute]
|
||||
if sa and sb:
|
||||
rfb = 1.0 - np.minimum(ca_sh / sa, cb_sh / sb).sum()
|
||||
else:
|
||||
rfb = 0.0
|
||||
|
||||
# Euclidean: √(Σa² + Σb² - 2·Σ(a·b)_shared)
|
||||
cross = (ca_sh * cb_sh).sum()
|
||||
eucl_partial = sq_sums[i] + sq_sums[j] - 2.0 * cross
|
||||
eucl = np.sqrt(max(eucl_partial, 0.0))
|
||||
|
||||
# RelfreqEuclidean: √(Σ(a/sa - b/sb)²)
|
||||
# = √(Σa²/sa² + Σb²/sb² - 2·Σ(a·b)_shared/(sa·sb))
|
||||
if sa and sb:
|
||||
rf_cross = (ca_sh / sa * (cb_sh / sb)).sum()
|
||||
rfe_partial = (sq_sums[i] / sa**2
|
||||
+ sq_sums[j] / sb**2
|
||||
- 2.0 * rf_cross)
|
||||
rfe = np.sqrt(max(rfe_partial, 0.0))
|
||||
else:
|
||||
rfe = 0.0
|
||||
|
||||
# Hellinger partial: Σ(√(a/sa) - √(b/sb))² over global universe
|
||||
# = 2 - 2·Σ√(a·b)_shared / √(sa·sb)
|
||||
if sa and sb:
|
||||
bc_coeff = np.sqrt(ca_sh * cb_sh).sum() / np.sqrt(sa * sb)
|
||||
hell_partial = max(2.0 - 2.0 * bc_coeff, 0.0)
|
||||
else:
|
||||
hell_partial = 0.0
|
||||
|
||||
sq2 = np.sqrt(2.0)
|
||||
hell = np.sqrt(hell_partial) / sq2
|
||||
hell_euc = np.sqrt(hell_partial)
|
||||
|
||||
# ── Fill symmetric matrices ───────────────────────────────────────
|
||||
for mat, val in [
|
||||
(shared_mat, inter),
|
||||
(hamming_mat, hamming),
|
||||
(jaccard_mat, jac),
|
||||
(bray_mat, bc),
|
||||
(relfreq_bray, rfb),
|
||||
(euclidean_mat, eucl),
|
||||
(relfreq_eucl, rfe),
|
||||
(hellinger_mat, hell),
|
||||
(hell_eucl_mat, hell_euc),
|
||||
]:
|
||||
mat[i, j] = val
|
||||
mat[j, i] = val
|
||||
|
||||
return {
|
||||
'shared_kmers': shared_mat,
|
||||
'hamming_dist': hamming_mat,
|
||||
'jaccard_dist': jaccard_mat,
|
||||
'bray_curtis_dist': bray_mat,
|
||||
'relfreq_bray_curtis_dist': relfreq_bray,
|
||||
'euclidean_dist': euclidean_mat,
|
||||
'relfreq_euclidean_dist': relfreq_eucl,
|
||||
'hellinger_dist': hellinger_mat,
|
||||
'hellinger_euclidean_dist': hell_eucl_mat,
|
||||
}
|
||||
|
||||
|
||||
# ── I/O ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def write_csv(path: Path, labels: list[str], mat: np.ndarray, fmt: str) -> None:
|
||||
with path.open('w') as fh:
|
||||
fh.write('genome,' + ','.join(labels) + '\n')
|
||||
for i, label in enumerate(labels):
|
||||
row = ','.join(format(mat[i, j], fmt) for j in range(len(labels)))
|
||||
fh.write(f'{label},{row}\n')
|
||||
print(f' → {path}', file=sys.stderr)
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
|
||||
ap = argparse.ArgumentParser(description=__doc__,
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter)
|
||||
ap.add_argument('--ref-dir', default='reference_index',
|
||||
help='Directory with per-specimen .npz files (default: reference_index)')
|
||||
ap.add_argument('--out-dir', default='reference_dist',
|
||||
help='Output directory for CSV files (default: reference_dist)')
|
||||
args = ap.parse_args()
|
||||
|
||||
ref_dir = Path(args.ref_dir)
|
||||
out_dir = Path(args.out_dir)
|
||||
out_dir.mkdir(exist_ok=True)
|
||||
|
||||
npz_files = sorted(ref_dir.glob('*.npz'))
|
||||
if not npz_files:
|
||||
print(f'ERROR: no .npz files found in {ref_dir}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
print(f'Loading {len(npz_files)} specimen(s) from {ref_dir}/', file=sys.stderr)
|
||||
specimens = []
|
||||
for f in npz_files:
|
||||
data = np.load(f)
|
||||
specimens.append({
|
||||
'name': f.stem,
|
||||
'kmers': data['kmers'],
|
||||
'counts': data['counts'],
|
||||
})
|
||||
print(f' {f.stem}: {len(data["kmers"]):,} kmers', file=sys.stderr)
|
||||
|
||||
labels = [s['name'] for s in specimens]
|
||||
n = len(labels)
|
||||
print(f'\nComputing pairwise distances for {n} specimens…', file=sys.stderr)
|
||||
|
||||
matrices = pairwise_stats(specimens)
|
||||
|
||||
print(f'\nWriting CSVs to {out_dir}/', file=sys.stderr)
|
||||
write_csv(out_dir / 'shared_kmers.csv', labels, matrices['shared_kmers'], 'd')
|
||||
write_csv(out_dir / 'hamming_dist.csv', labels, matrices['hamming_dist'], '.6f')
|
||||
write_csv(out_dir / 'jaccard_dist.csv', labels, matrices['jaccard_dist'], '.6f')
|
||||
write_csv(out_dir / 'bray_curtis_dist.csv', labels, matrices['bray_curtis_dist'], '.6f')
|
||||
write_csv(out_dir / 'relfreq_bray_curtis_dist.csv', labels, matrices['relfreq_bray_curtis_dist'], '.6f')
|
||||
write_csv(out_dir / 'euclidean_dist.csv', labels, matrices['euclidean_dist'], '.6f')
|
||||
write_csv(out_dir / 'relfreq_euclidean_dist.csv', labels, matrices['relfreq_euclidean_dist'], '.6f')
|
||||
write_csv(out_dir / 'hellinger_dist.csv', labels, matrices['hellinger_dist'], '.6f')
|
||||
write_csv(out_dir / 'hellinger_euclidean_dist.csv', labels, matrices['hellinger_euclidean_dist'], '.6f')
|
||||
|
||||
print('\nDone.', file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+182
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare all reference distance matrices against obikmer distance outputs.
|
||||
|
||||
Reads from:
|
||||
reference_dist/ — ground-truth matrices computed by build_reference_dist.py
|
||||
obikmer_dist/ — matrices produced by `obikmer distance`
|
||||
|
||||
Handles label reordering: both matrices are sorted by genome label before
|
||||
element-wise comparison, so column/row order differences are irrelevant.
|
||||
|
||||
Output: stats/dist_comparison/summary.csv
|
||||
comparison,max_abs,mean_abs,rmse,n_pairs,status
|
||||
"""
|
||||
import csv
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
# ── CSV loading ───────────────────────────────────────────────────────────────
|
||||
|
||||
def load_matrix(path: Path) -> tuple[list[str], np.ndarray]:
|
||||
"""Load a distance-matrix CSV; return (sorted_labels, matrix_float64)."""
|
||||
with path.open() as fh:
|
||||
reader = csv.reader(fh)
|
||||
header = next(reader)[1:] # skip 'genome' column
|
||||
raw: dict[str, list[float]] = {}
|
||||
for row in reader:
|
||||
raw[row[0]] = [float(x) for x in row[1:]]
|
||||
|
||||
label_to_col = {h: i for i, h in enumerate(header)}
|
||||
labels = sorted(raw.keys())
|
||||
n = len(labels)
|
||||
mat = np.zeros((n, n), dtype=np.float64)
|
||||
for i, ri in enumerate(labels):
|
||||
for j, cj in enumerate(labels):
|
||||
mat[i, j] = raw[ri][label_to_col[cj]]
|
||||
return labels, mat
|
||||
|
||||
|
||||
# ── comparison ────────────────────────────────────────────────────────────────
|
||||
|
||||
def compare(label: str,
|
||||
ref_path: Path,
|
||||
obi_path: Path,
|
||||
tol: float = 1e-4) -> dict:
|
||||
if not ref_path.exists():
|
||||
return {'comparison': label, 'status': 'REF_MISSING',
|
||||
'max_abs': '', 'mean_abs': '', 'rmse': '', 'n_pairs': ''}
|
||||
if not obi_path.exists():
|
||||
return {'comparison': label, 'status': 'OBI_MISSING',
|
||||
'max_abs': '', 'mean_abs': '', 'rmse': '', 'n_pairs': ''}
|
||||
|
||||
ref_labels, ref_mat = load_matrix(ref_path)
|
||||
obi_labels, obi_mat = load_matrix(obi_path)
|
||||
|
||||
if ref_labels != obi_labels:
|
||||
only_ref = sorted(set(ref_labels) - set(obi_labels))
|
||||
only_obi = sorted(set(obi_labels) - set(ref_labels))
|
||||
print(f' [{label}] label mismatch — '
|
||||
f'only_ref={only_ref} only_obi={only_obi}', file=sys.stderr)
|
||||
return {'comparison': label, 'status': 'LABEL_MISMATCH',
|
||||
'max_abs': '', 'mean_abs': '', 'rmse': '', 'n_pairs': ''}
|
||||
|
||||
n = len(ref_labels)
|
||||
# Off-diagonal mask
|
||||
mask = ~np.eye(n, dtype=bool)
|
||||
diff = np.abs(ref_mat[mask] - obi_mat[mask])
|
||||
n_pairs = diff.size
|
||||
|
||||
max_abs = float(diff.max())
|
||||
mean_abs = float(diff.mean())
|
||||
rmse = float(np.sqrt((diff ** 2).mean()))
|
||||
status = 'PASS' if max_abs <= tol else 'FAIL'
|
||||
|
||||
print(f' [{label}] n={n_pairs} '
|
||||
f'max={max_abs:.3e} mean={mean_abs:.3e} rmse={rmse:.3e} {status}',
|
||||
file=sys.stderr)
|
||||
return {
|
||||
'comparison': label,
|
||||
'max_abs': f'{max_abs:.6e}',
|
||||
'mean_abs': f'{mean_abs:.6e}',
|
||||
'rmse': f'{rmse:.6e}',
|
||||
'n_pairs': str(n_pairs),
|
||||
'status': status,
|
||||
}
|
||||
|
||||
|
||||
# ── comparison table ──────────────────────────────────────────────────────────
|
||||
|
||||
# (label, ref_csv, obikmer_csv)
|
||||
# The reference jaccard/shared is presence-based, which should match both
|
||||
# presence/jaccard and count/jaccard (threshold=1).
|
||||
COMPARISONS = [
|
||||
# ── presence index ────────────────────────────────────────────────────────
|
||||
('presence/jaccard_dist',
|
||||
'reference_dist/jaccard_dist.csv',
|
||||
'obikmer_dist/presence/jaccard_dist.csv'),
|
||||
|
||||
('presence/jaccard_shared',
|
||||
'reference_dist/shared_kmers.csv',
|
||||
'obikmer_dist/presence/jaccard_shared.csv'),
|
||||
|
||||
('presence/hamming_dist',
|
||||
'reference_dist/hamming_dist.csv',
|
||||
'obikmer_dist/presence/hamming_dist.csv'),
|
||||
|
||||
# ── count index (jaccard cross-check) ─────────────────────────────────────
|
||||
('count/jaccard_dist',
|
||||
'reference_dist/jaccard_dist.csv',
|
||||
'obikmer_dist/count/jaccard_dist.csv'),
|
||||
|
||||
('count/jaccard_shared',
|
||||
'reference_dist/shared_kmers.csv',
|
||||
'obikmer_dist/count/jaccard_shared.csv'),
|
||||
|
||||
# ── count index (count-based metrics) ────────────────────────────────────
|
||||
('count/bray_curtis_dist',
|
||||
'reference_dist/bray_curtis_dist.csv',
|
||||
'obikmer_dist/count/bray_curtis_dist.csv'),
|
||||
|
||||
('count/relfreq_bray_curtis_dist',
|
||||
'reference_dist/relfreq_bray_curtis_dist.csv',
|
||||
'obikmer_dist/count/relfreq_bray_curtis_dist.csv'),
|
||||
|
||||
('count/euclidean_dist',
|
||||
'reference_dist/euclidean_dist.csv',
|
||||
'obikmer_dist/count/euclidean_dist.csv'),
|
||||
|
||||
('count/relfreq_euclidean_dist',
|
||||
'reference_dist/relfreq_euclidean_dist.csv',
|
||||
'obikmer_dist/count/relfreq_euclidean_dist.csv'),
|
||||
|
||||
('count/hellinger_dist',
|
||||
'reference_dist/hellinger_dist.csv',
|
||||
'obikmer_dist/count/hellinger_dist.csv'),
|
||||
|
||||
('count/hellinger_euclidean_dist',
|
||||
'reference_dist/hellinger_euclidean_dist.csv',
|
||||
'obikmer_dist/count/hellinger_euclidean_dist.csv'),
|
||||
]
|
||||
|
||||
|
||||
# ── main ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
|
||||
import argparse
|
||||
ap = argparse.ArgumentParser(description=__doc__,
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter)
|
||||
ap.add_argument('--tol', type=float, default=1e-4,
|
||||
help='Max abs diff threshold for PASS/FAIL (default 1e-4)')
|
||||
ap.add_argument('--out', default='stats/dist_comparison/summary.csv',
|
||||
help='Output summary CSV path')
|
||||
args = ap.parse_args()
|
||||
|
||||
out_path = Path(args.out)
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
print(f'Comparing {len(COMPARISONS)} matrix pairs…', file=sys.stderr)
|
||||
rows = []
|
||||
for label, ref, obi in COMPARISONS:
|
||||
rows.append(compare(label, Path(ref), Path(obi), tol=args.tol))
|
||||
|
||||
fields = ['comparison', 'max_abs', 'mean_abs', 'rmse', 'n_pairs', 'status']
|
||||
with out_path.open('w', newline='') as fh:
|
||||
w = csv.DictWriter(fh, fieldnames=fields)
|
||||
w.writeheader()
|
||||
w.writerows(rows)
|
||||
|
||||
print(f'\n→ {out_path}', file=sys.stderr)
|
||||
|
||||
n_fail = sum(1 for r in rows if r.get('status') == 'FAIL')
|
||||
n_pass = sum(1 for r in rows if r.get('status') == 'PASS')
|
||||
print(f'Summary: {n_pass} PASS {n_fail} FAIL '
|
||||
f'{len(rows) - n_pass - n_fail} SKIP', file=sys.stderr)
|
||||
if n_fail:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -0,0 +1,21 @@
|
||||
genome,Candidozyma_auris--GCF_003013715.1_ASM301371v2,Acidobacterium_capsulatum--ATCC_51196,Bacillus_subtilis--168,Escherichia_coli--CFT073,Escherichia_coli--EDL933,Escherichia_coli--K-12_MG1655,Escherichia_coli--K-12_W3110,Klebsiella_pneumoniae--ATCC_13883,Klebsiella_pneumoniae--HS11286,Klebsiella_pneumoniae--MGH_78578,Opitutus_terrae--PB90-1,Proteus_mirabilis--HI4320,Saccharolobus_islandicus--M.16.4,Salmonella_enterica--AKU_12601,Salmonella_enterica--CT18,Salmonella_enterica--LT2,Salmonella_enterica--P125109,Shouchella_clausii--KSM-K16,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,Yersinia_ruckeri--YRB
|
||||
Candidozyma_auris--GCF_003013715.1_ASM301371v2,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
|
||||
Acidobacterium_capsulatum--ATCC_51196,1.000000,0.000000,0.999981,0.999990,0.999989,0.999987,0.999987,0.999990,0.999988,0.999988,0.999994,0.999989,1.000000,0.999988,0.999987,0.999987,0.999988,0.999989,0.999991,0.999987
|
||||
Bacillus_subtilis--168,1.000000,0.999981,0.000000,0.999990,0.999989,0.999989,0.999989,0.999989,0.999988,0.999986,0.999995,0.999985,0.999999,0.999988,0.999987,0.999989,0.999988,0.999778,0.999993,0.999987
|
||||
Escherichia_coli--CFT073,1.000000,0.999990,0.999990,0.000000,0.825741,0.807495,0.807218,0.991156,0.996855,0.997849,0.999996,0.999633,1.000000,0.993885,0.996736,0.994148,0.993821,0.999991,0.999984,0.999291
|
||||
Escherichia_coli--EDL933,1.000000,0.999989,0.999989,0.825741,0.000000,0.735107,0.734775,0.996126,0.998058,0.997908,0.999997,0.999640,1.000000,0.993993,0.997126,0.994390,0.994059,0.999991,0.999986,0.999292
|
||||
Escherichia_coli--K-12_MG1655,1.000000,0.999987,0.999989,0.807495,0.735107,0.000000,0.382567,0.996190,0.997747,0.997455,0.999996,0.999604,1.000000,0.993444,0.996645,0.993773,0.993431,0.999989,0.999984,0.999174
|
||||
Escherichia_coli--K-12_W3110,1.000000,0.999987,0.999989,0.807218,0.734775,0.382567,0.000000,0.996220,0.997761,0.997467,0.999995,0.999604,1.000000,0.993445,0.996669,0.993769,0.993443,0.999990,0.999985,0.999165
|
||||
Klebsiella_pneumoniae--ATCC_13883,1.000000,0.999990,0.999989,0.991156,0.996126,0.996190,0.996220,0.000000,0.845220,0.840545,0.999997,0.999648,1.000000,0.996177,0.998128,0.996268,0.996052,0.999990,0.999987,0.999325
|
||||
Klebsiella_pneumoniae--HS11286,1.000000,0.999988,0.999988,0.996855,0.998058,0.997747,0.997761,0.845220,0.000000,0.906475,0.999996,0.999683,1.000000,0.997724,0.995697,0.997776,0.997769,0.999989,0.999979,0.999463
|
||||
Klebsiella_pneumoniae--MGH_78578,1.000000,0.999988,0.999986,0.997849,0.997908,0.997455,0.997467,0.840545,0.906475,0.000000,0.999996,0.999704,1.000000,0.997928,0.995054,0.997844,0.997868,0.999990,0.999980,0.999479
|
||||
Opitutus_terrae--PB90-1,1.000000,0.999994,0.999995,0.999996,0.999997,0.999996,0.999995,0.999997,0.999996,0.999996,0.000000,0.999997,0.999998,0.999996,0.999996,0.999996,0.999995,0.999997,0.999993,0.999996
|
||||
Proteus_mirabilis--HI4320,1.000000,0.999989,0.999985,0.999633,0.999640,0.999604,0.999604,0.999648,0.999683,0.999704,0.999997,0.000000,1.000000,0.999604,0.999699,0.999622,0.999613,0.999987,0.999983,0.999505
|
||||
Saccharolobus_islandicus--M.16.4,1.000000,1.000000,0.999999,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.999998,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
|
||||
Salmonella_enterica--AKU_12601,1.000000,0.999988,0.999988,0.993885,0.993993,0.993444,0.993445,0.996177,0.997724,0.997928,0.999996,0.999604,1.000000,0.000000,0.869238,0.682277,0.663383,0.999990,0.999985,0.999260
|
||||
Salmonella_enterica--CT18,1.000000,0.999987,0.999987,0.996736,0.997126,0.996645,0.996669,0.998128,0.995697,0.995054,0.999996,0.999699,1.000000,0.869238,0.000000,0.890872,0.886148,0.999988,0.999976,0.999524
|
||||
Salmonella_enterica--LT2,1.000000,0.999987,0.999989,0.994148,0.994390,0.993773,0.993769,0.996268,0.997776,0.997844,0.999996,0.999622,1.000000,0.682277,0.890872,0.000000,0.622606,0.999989,0.999985,0.999296
|
||||
Salmonella_enterica--P125109,1.000000,0.999988,0.999988,0.993821,0.994059,0.993431,0.993443,0.996052,0.997769,0.997868,0.999995,0.999613,1.000000,0.663383,0.886148,0.622606,0.000000,0.999988,0.999983,0.999270
|
||||
Shouchella_clausii--KSM-K16,1.000000,0.999989,0.999778,0.999991,0.999991,0.999989,0.999990,0.999990,0.999989,0.999990,0.999997,0.999987,1.000000,0.999990,0.999988,0.999989,0.999988,0.000000,0.999991,0.999988
|
||||
Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,1.000000,0.999991,0.999993,0.999984,0.999986,0.999984,0.999985,0.999987,0.999979,0.999980,0.999993,0.999983,1.000000,0.999985,0.999976,0.999985,0.999983,0.999991,0.000000,0.999983
|
||||
Yersinia_ruckeri--YRB,1.000000,0.999987,0.999987,0.999291,0.999292,0.999174,0.999165,0.999325,0.999463,0.999479,0.999996,0.999505,1.000000,0.999260,0.999524,0.999296,0.999270,0.999988,0.999983,0.000000
|
||||
|
@@ -0,0 +1 @@
|
||||
(((((((((((Candidozyma_auris--GCF_003013715.1_ASM301371v2:0.5000001881725941,Saccharolobus_islandicus--M.16.4:0.4999993211600824):0.0000023411501775538747,Opitutus_terrae--PB90-1:0.499997075187947):0.0000029791191795691675,(Acidobacterium_capsulatum--ATCC_51196:0.49999227771334689,(Bacillus_subtilis--168:0.49988797935621456,Shouchella_clausii--KSM-K16:0.49988984146059159):0.0001037210285571577):0.0000023959836053522034):0.0000034093646568700288,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1:0.4999920159222422):0.000199555100890203,Proteus_mirabilis--HI4320:0.49979129185300427):0.00010103619067070024,Yersinia_ruckeri--YRB:0.4996806650749249):0.0013719139155004,(Klebsiella_pneumoniae--HS11286:0.43798845051648258,(Klebsiella_pneumoniae--ATCC_13883:0.41780293826821265,Klebsiella_pneumoniae--MGH_78578:0.42274184870836559):0.017586732339732737):0.0604124197073832):0.0006482538063555254,(Salmonella_enterica--CT18:0.43952894448143017,(Salmonella_enterica--AKU_12601:0.3357977326267918,(Salmonella_enterica--LT2:0.31203395843666389,Salmonella_enterica--P125109:0.31057217324861216):0.025729515856701136):0.10292985918524672):0.05825411485542886):0.08937928015651564,Escherichia_coli--CFT073:0.40806501650701029):0.0410131211869626,Escherichia_coli--EDL933:0.3681464750911808):0.1755112579711463,Escherichia_coli--K-12_MG1655:0.19129818036662728,Escherichia_coli--K-12_W3110:0.19126872019906239);
|
||||
@@ -0,0 +1,21 @@
|
||||
genome,Candidozyma_auris--GCF_003013715.1_ASM301371v2,Acidobacterium_capsulatum--ATCC_51196,Bacillus_subtilis--168,Escherichia_coli--CFT073,Escherichia_coli--EDL933,Escherichia_coli--K-12_MG1655,Escherichia_coli--K-12_W3110,Klebsiella_pneumoniae--ATCC_13883,Klebsiella_pneumoniae--HS11286,Klebsiella_pneumoniae--MGH_78578,Opitutus_terrae--PB90-1,Proteus_mirabilis--HI4320,Saccharolobus_islandicus--M.16.4,Salmonella_enterica--AKU_12601,Salmonella_enterica--CT18,Salmonella_enterica--LT2,Salmonella_enterica--P125109,Shouchella_clausii--KSM-K16,Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,Yersinia_ruckeri--YRB
|
||||
Candidozyma_auris--GCF_003013715.1_ASM301371v2,0,0,0,0,0,0,0,0,0,0,0,0,8,0,1,0,0,0,0,3
|
||||
Acidobacterium_capsulatum--ATCC_51196,0,0,203,119,128,141,140,116,109,111,78,112,0,136,109,147,134,117,55,129
|
||||
Bacillus_subtilis--168,0,203,0,124,132,128,123,133,109,130,66,158,6,131,112,124,135,2393,46,124
|
||||
Escherichia_coli--CFT073,0,119,124,0,1966777,1998059,1999094,117743,32029,22312,63,4225,0,74946,31918,73311,76585,113,128,7854
|
||||
Escherichia_coli--EDL933,0,128,132,1966777,0,2627885,2628700,52488,20134,22064,48,4202,0,74655,28602,71244,74665,112,108,7963
|
||||
Escherichia_coli--K-12_MG1655,0,141,128,1998059,2627885,0,4452541,48302,21382,24602,47,4277,0,75729,30449,73622,76778,119,111,8566
|
||||
Escherichia_coli--K-12_W3110,0,140,123,1999094,2628700,4452541,0,47894,21226,24470,68,4278,0,75658,30207,73614,76583,112,108,8660
|
||||
Klebsiella_pneumoniae--ATCC_13883,0,116,133,117743,52488,48302,47894,0,1416091,1477759,42,4172,0,48296,18988,48144,50416,120,106,7712
|
||||
Klebsiella_pneumoniae--HS11286,0,109,109,32029,20134,21382,21226,1416091,0,644063,42,2738,0,21498,29758,21606,21376,99,102,4417
|
||||
Klebsiella_pneumoniae--MGH_78578,0,111,130,22312,22064,24602,24470,1477759,644063,0,42,2614,0,19948,35067,21330,20813,97,102,4374
|
||||
Opitutus_terrae--PB90-1,0,78,66,63,48,47,68,42,42,42,0,43,18,57,42,53,66,39,58,43
|
||||
Proteus_mirabilis--HI4320,0,112,158,4225,4202,4277,4278,4172,2738,2614,43,0,0,4254,2481,4166,4215,131,103,4704
|
||||
Saccharolobus_islandicus--M.16.4,8,0,6,0,0,0,0,0,0,0,18,0,0,0,0,0,0,0,0,0
|
||||
Salmonella_enterica--AKU_12601,0,136,131,74946,74655,75729,75658,48296,21498,19948,57,4254,0,0,1047731,2857146,2951421,117,108,7643
|
||||
Salmonella_enterica--CT18,1,109,112,31918,28602,30449,30207,18988,29758,35067,42,2481,0,1047731,0,917948,940297,106,106,3716
|
||||
Salmonella_enterica--LT2,0,147,124,73311,71244,73622,73614,48144,21606,21330,53,4166,0,2857146,917948,0,3284800,122,108,7460
|
||||
Salmonella_enterica--P125109,0,134,135,76585,74665,76778,76583,50416,21376,20813,66,4215,0,2951421,940297,3284800,0,134,124,7645
|
||||
Shouchella_clausii--KSM-K16,0,117,2393,113,112,119,112,120,99,97,39,131,0,117,106,122,134,0,58,124
|
||||
Wolbachia_endosymbiont--GCF_000306885.1_ASM30688v1,0,55,46,128,108,111,108,106,102,102,58,103,0,108,106,108,124,58,0,96
|
||||
Yersinia_ruckeri--YRB,3,129,124,7854,7963,8566,8660,7712,4417,4374,43,4704,0,7643,3716,7460,7645,124,96,0
|
||||
|
Reference in New Issue
Block a user