#!/usr/bin/env python3 """Compute reference pairwise distance matrices from per-specimen .npz kmer indexes. Reads all .npz files in reference_index/ (each containing sorted uint64 `kmers` and uint32 `counts`), computes all distance metrics supported by `obikmer distance`, and writes one CSV per metric to reference_dist/. Output CSV format matches `obikmer distance --output`: - first row: "genome", then specimen names - subsequent rows: specimen name, then float or int values Metrics written jaccard_dist.csv Jaccard distance (presence/absence) shared_kmers.csv Shared-kmer count matrix (intersection size) bray_curtis_dist.csv Bray-Curtis dissimilarity (raw counts) relfreq_bray_curtis_dist.csv Bray-Curtis on relative frequencies euclidean_dist.csv Euclidean distance (raw counts) relfreq_euclidean_dist.csv Euclidean distance on relative frequencies hellinger_dist.csv Hellinger distance hellinger_euclidean_dist.csv Euclidean distance in Hellinger space """ import argparse import sys from pathlib import Path import numpy as np # ── pairwise helpers ────────────────────────────────────────────────────────── def shared_indices(a_kmers: np.ndarray, b_kmers: np.ndarray): """Return index arrays (idx_a, idx_b) for kmers present in both sets. Both arrays must be sorted uint64. Uses searchsorted: O(|B| log |A|). """ pos = np.searchsorted(a_kmers, b_kmers) pos = np.clip(pos, 0, len(a_kmers) - 1) mask = a_kmers[pos] == b_kmers idx_b = np.where(mask)[0] idx_a = pos[idx_b] return idx_a, idx_b def pairwise_stats(specimens: list[dict]) -> dict[str, np.ndarray]: """Compute all pairwise distance matrices at once. Returns a dict metric_name → ndarray (n×n float64 or int64). Each specimen dict has keys: name, kmers, counts. """ n = len(specimens) # Pre-compute per-specimen scalars kmer_counts = np.array([len(s['kmers']) for s in specimens], dtype=np.uint64) count_sums = np.array([s['counts'].sum() for s in specimens], dtype=np.uint64) # Per-specimen sum-of-squares (for Euclidean decomposition) sq_sums = np.array([(s['counts'].astype(np.float64) ** 2).sum() for s in specimens]) # Allocate output matrices shared_mat = np.zeros((n, n), dtype=np.uint64) hamming_mat = np.zeros((n, n), dtype=np.float64) jaccard_mat = np.zeros((n, n), dtype=np.float64) bray_mat = np.zeros((n, n), dtype=np.float64) relfreq_bray = np.zeros((n, n), dtype=np.float64) euclidean_mat = np.zeros((n, n), dtype=np.float64) relfreq_eucl = np.zeros((n, n), dtype=np.float64) hellinger_mat = np.zeros((n, n), dtype=np.float64) hell_eucl_mat = np.zeros((n, n), dtype=np.float64) for i in range(n): a_km = specimens[i]['kmers'] a_ct = specimens[i]['counts'].astype(np.float64) sa = float(count_sums[i]) na = int(kmer_counts[i]) for j in range(i + 1, n): b_km = specimens[j]['kmers'] b_ct = specimens[j]['counts'].astype(np.float64) sb = float(count_sums[j]) nb = int(kmer_counts[j]) idx_a, idx_b = shared_indices(a_km, b_km) inter = len(idx_a) ca_sh = a_ct[idx_a] cb_sh = b_ct[idx_b] # ── Presence metrics ────────────────────────────────────────────── union = na + nb - inter jac = (1.0 - inter / union) if union else 0.0 hamming = float(na + nb - 2 * inter) # |A Δ B| # ── Count metrics ───────────────────────────────────────────────── # Bray-Curtis: 1 - 2*Σmin(a,b) / (Σa + Σb) sum_min = np.minimum(ca_sh, cb_sh).sum() denom_bc = sa + sb bc = (1.0 - 2.0 * sum_min / denom_bc) if denom_bc else 0.0 # RelfreqBray: 1 - Σmin(a/sa, b/sb) [only shared contribute] if sa and sb: rfb = 1.0 - np.minimum(ca_sh / sa, cb_sh / sb).sum() else: rfb = 0.0 # Euclidean: √(Σa² + Σb² - 2·Σ(a·b)_shared) cross = (ca_sh * cb_sh).sum() eucl_partial = sq_sums[i] + sq_sums[j] - 2.0 * cross eucl = np.sqrt(max(eucl_partial, 0.0)) # RelfreqEuclidean: √(Σ(a/sa - b/sb)²) # = √(Σa²/sa² + Σb²/sb² - 2·Σ(a·b)_shared/(sa·sb)) if sa and sb: rf_cross = (ca_sh / sa * (cb_sh / sb)).sum() rfe_partial = (sq_sums[i] / sa**2 + sq_sums[j] / sb**2 - 2.0 * rf_cross) rfe = np.sqrt(max(rfe_partial, 0.0)) else: rfe = 0.0 # Hellinger partial: Σ(√(a/sa) - √(b/sb))² over global universe # = 2 - 2·Σ√(a·b)_shared / √(sa·sb) if sa and sb: bc_coeff = np.sqrt(ca_sh * cb_sh).sum() / np.sqrt(sa * sb) hell_partial = max(2.0 - 2.0 * bc_coeff, 0.0) else: hell_partial = 0.0 sq2 = np.sqrt(2.0) hell = np.sqrt(hell_partial) / sq2 hell_euc = np.sqrt(hell_partial) # ── Fill symmetric matrices ─────────────────────────────────────── for mat, val in [ (shared_mat, inter), (hamming_mat, hamming), (jaccard_mat, jac), (bray_mat, bc), (relfreq_bray, rfb), (euclidean_mat, eucl), (relfreq_eucl, rfe), (hellinger_mat, hell), (hell_eucl_mat, hell_euc), ]: mat[i, j] = val mat[j, i] = val return { 'shared_kmers': shared_mat, 'hamming_dist': hamming_mat, 'jaccard_dist': jaccard_mat, 'bray_curtis_dist': bray_mat, 'relfreq_bray_curtis_dist': relfreq_bray, 'euclidean_dist': euclidean_mat, 'relfreq_euclidean_dist': relfreq_eucl, 'hellinger_dist': hellinger_mat, 'hellinger_euclidean_dist': hell_eucl_mat, } # ── I/O ─────────────────────────────────────────────────────────────────────── def write_csv(path: Path, labels: list[str], mat: np.ndarray, fmt: str) -> None: with path.open('w') as fh: fh.write('genome,' + ','.join(labels) + '\n') for i, label in enumerate(labels): row = ','.join(format(mat[i, j], fmt) for j in range(len(labels))) fh.write(f'{label},{row}\n') print(f' → {path}', file=sys.stderr) # ── main ───────────────────────────────────────────────────────────────────── def main() -> None: ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) ap.add_argument('--ref-dir', default='reference_index', help='Directory with per-specimen .npz files (default: reference_index)') ap.add_argument('--out-dir', default='reference_dist', help='Output directory for CSV files (default: reference_dist)') args = ap.parse_args() ref_dir = Path(args.ref_dir) out_dir = Path(args.out_dir) out_dir.mkdir(exist_ok=True) npz_files = sorted(ref_dir.glob('*.npz')) if not npz_files: print(f'ERROR: no .npz files found in {ref_dir}', file=sys.stderr) sys.exit(1) print(f'Loading {len(npz_files)} specimen(s) from {ref_dir}/', file=sys.stderr) specimens = [] for f in npz_files: data = np.load(f) specimens.append({ 'name': f.stem, 'kmers': data['kmers'], 'counts': data['counts'], }) print(f' {f.stem}: {len(data["kmers"]):,} kmers', file=sys.stderr) labels = [s['name'] for s in specimens] n = len(labels) print(f'\nComputing pairwise distances for {n} specimens…', file=sys.stderr) matrices = pairwise_stats(specimens) print(f'\nWriting CSVs to {out_dir}/', file=sys.stderr) write_csv(out_dir / 'shared_kmers.csv', labels, matrices['shared_kmers'], 'd') write_csv(out_dir / 'hamming_dist.csv', labels, matrices['hamming_dist'], '.6f') write_csv(out_dir / 'jaccard_dist.csv', labels, matrices['jaccard_dist'], '.6f') write_csv(out_dir / 'bray_curtis_dist.csv', labels, matrices['bray_curtis_dist'], '.6f') write_csv(out_dir / 'relfreq_bray_curtis_dist.csv', labels, matrices['relfreq_bray_curtis_dist'], '.6f') write_csv(out_dir / 'euclidean_dist.csv', labels, matrices['euclidean_dist'], '.6f') write_csv(out_dir / 'relfreq_euclidean_dist.csv', labels, matrices['relfreq_euclidean_dist'], '.6f') write_csv(out_dir / 'hellinger_dist.csv', labels, matrices['hellinger_dist'], '.6f') write_csv(out_dir / 'hellinger_euclidean_dist.csv', labels, matrices['hellinger_euclidean_dist'], '.6f') print('\nDone.', file=sys.stderr) if __name__ == '__main__': main()