#!/usr/bin/env python3 """Parse obikmer merge debug log → Markdown performance report.""" import re import sys from datetime import datetime from collections import defaultdict from statistics import mean, median, stdev ANSI = re.compile(r'\x1b\[[0-9;]*m') def strip(s): return ANSI.sub('', s) def parse_ts(s): return datetime.fromisoformat(s.replace('Z', '+00:00')) def dur_s(s): s = s.strip() if s.endswith('ms'): return float(s[:-2]) / 1e3 if s.endswith('µs'): return float(s[:-2]) / 1e6 if s.endswith('us'): return float(s[:-2]) / 1e6 if s.endswith('ns'): return float(s[:-2]) / 1e9 if s.endswith('s'): return float(s[:-1]) return float(s) def fmt_s(s): if s < 0.001: return f"{s*1e6:.0f}µs" if s < 1: return f"{s*1e3:.0f}ms" if s < 60: return f"{s:.2f}s" return f"{s/60:.1f}min ({s:.0f}s)" def fmt_rate(n, s): if s <= 0: return "—" r = n / s if r >= 1e9: return f"{r/1e9:.2f}G/s" if r >= 1e6: return f"{r/1e6:.2f}M/s" if r >= 1e3: return f"{r/1e3:.2f}K/s" return f"{r:.0f}/s" def pct(a, b): return f"{100*a/b:.1f}%" if b else "—" def stats_row(label, values, unit="s", fmt=fmt_s): if not values: return f"| {label} | — | — | — | — | — |" mn, mx, med, av = min(values), max(values), median(values), mean(values) sd = stdev(values) if len(values) > 1 else 0 return f"| {label} | {fmt(mn)} | {fmt(med)} | {fmt(av)} | {fmt(mx)} | {fmt(sd)} |" # ── patterns ────────────────────────────────────────────────────────────────── TS = r'(\d{4}-\d{2}-\d{2}T[\d:.]+Z)' pats = { 'graph_done': re.compile(TS + r'.*partition (\d+): de Bruijn graph done — (\d+) new kmers'), 'trav_start': re.compile(TS + r'.*partition (\d+): unitig traversal start — (\d+) nodes'), 'trav_closing': re.compile(TS + r'.*partition (\d+): unitig writer closing'), 'trav_closed': re.compile(TS + r'.*partition (\d+): unitig writer closed'), 'graph_dropped': re.compile(TS + r'.*partition (\d+): graph dropped — starting MPHF build \((\d+) unitigs\)'), 'mphf_done': re.compile(TS + r'.*partition (\d+): MPHF build done'), 'mphf_open': re.compile(TS + r'.*partition (\d+): MPHF open in ([\d.]+)s'), 'bld_ready': re.compile(TS + r'.*partition (\d+): builders ready in ([\d.]+)s'), 'pass2_done': re.compile(TS + r'.*partition (\d+): pass2 pipeline done in ([\d.]+)s'), 'bld_closed': re.compile(TS + r'.*partition (\d+): builders closed in ([\d.]+)s'), 'part_done': re.compile(TS + r'.*partition (\d+): done in ([\d.]+)s — (\d+) new kmers'), 'worker': re.compile(TS + r'.*activated worker (\d+).*efficiency (\d+)%.*gain vs prev (\d+)%'), 'worker_poll': re.compile(TS + r'.*activated worker (\d+) \(poll\).*efficiency (\d+)%'), 'compute_deg': re.compile(TS + r'.*partition (\d+): compute_degrees in ([\d.]+)s — (\d+) nodes'), 'stage_done': re.compile(TS + r'.*done stage=merge_partitions wall_secs=([\d.]+)'), 'workers_rep': re.compile(r'workers spawned: (\d+) / (\d+)'), } # ── parse ───────────────────────────────────────────────────────────────────── P = defaultdict(dict) # partition_id → timing dict workers_ev = [] wall_total = None workers_final = (None, None) with open(sys.argv[1]) as f: for raw in f: line = strip(raw) m = pats['graph_done'].search(line) if m: pid = int(m.group(2)) P[pid]['n_kmers'] = int(m.group(3)) P[pid]['graph_done_ts'] = parse_ts(m.group(1)) continue m = pats['trav_start'].search(line) if m: pid = int(m.group(2)) P[pid]['trav_start_ts'] = parse_ts(m.group(1)) P[pid]['n_nodes'] = int(m.group(3)) continue m = pats['trav_closing'].search(line) if m: pid = int(m.group(2)) P[pid]['trav_closing_ts'] = parse_ts(m.group(1)) continue m = pats['trav_closed'].search(line) if m: pid = int(m.group(2)) P[pid]['trav_closed_ts'] = parse_ts(m.group(1)) continue m = pats['graph_dropped'].search(line) if m: pid = int(m.group(2)) P[pid]['drop_ts'] = parse_ts(m.group(1)) P[pid]['n_unitigs'] = int(m.group(3)) continue m = pats['mphf_done'].search(line) if m: pid = int(m.group(2)) P[pid]['mphf_done_ts'] = parse_ts(m.group(1)) continue m = pats['mphf_open'].search(line) if m: pid = int(m.group(2)) P[pid]['mphf_open_s'] = float(m.group(3)) continue m = pats['bld_ready'].search(line) if m: pid = int(m.group(2)) P[pid]['bld_ready_s'] = float(m.group(3)) continue m = pats['pass2_done'].search(line) if m: pid = int(m.group(2)) P[pid]['pass2_s'] = float(m.group(3)) continue m = pats['bld_closed'].search(line) if m: pid = int(m.group(2)) P[pid]['bld_closed_s'] = float(m.group(3)) continue m = pats['part_done'].search(line) if m: pid = int(m.group(2)) P[pid]['total_s'] = float(m.group(3)) P[pid]['done_ts'] = parse_ts(m.group(1)) continue m = pats['worker'].search(line) if m: workers_ev.append({'n': int(m.group(2)), 'eff': int(m.group(3)), 'gain': int(m.group(4)), 'ts': parse_ts(m.group(1)), 'poll': False}) continue m = pats['worker_poll'].search(line) if m: workers_ev.append({'n': int(m.group(2)), 'eff': int(m.group(3)), 'gain': None, 'ts': parse_ts(m.group(1)), 'poll': True}) continue m = pats['compute_deg'].search(line) if m: pid = int(m.group(2)) P[pid]['cdeg_s'] = float(m.group(3)) P[pid]['n_nodes'] = P[pid].get('n_nodes') or int(m.group(4)) continue m = pats['stage_done'].search(line) if m: wall_total = float(m.group(2)) continue m = pats['workers_rep'].search(line) if m: workers_final = (int(m.group(1)), int(m.group(2))) continue # ── derive per-partition phases ─────────────────────────────────────────────── def tsdiff(p, k1, k2): if k1 in p and k2 in p: return (p[k2] - p[k1]).total_seconds() return None phases = {} for pid, p in P.items(): row = {'pid': pid} row['n_kmers'] = p.get('n_kmers', 0) row['n_nodes'] = p.get('n_nodes', 0) row['n_unitigs']= p.get('n_unitigs', 0) row['total_s'] = p.get('total_s') row['cdeg_s'] = p.get('cdeg_s') row['mphf_open_s'] = p.get('mphf_open_s') row['bld_ready_s'] = p.get('bld_ready_s') row['pass2_s'] = p.get('pass2_s') row['bld_closed_s'] = p.get('bld_closed_s') # Traversal: trav_start → trav_closing (= writing all unitigs) row['trav_s'] = tsdiff(p, 'trav_start_ts', 'trav_closing_ts') # Writer close: trav_closing → trav_closed row['close_s'] = tsdiff(p, 'trav_closing_ts', 'trav_closed_ts') # Graph drop: trav_closed → drop_ts row['drop_s'] = tsdiff(p, 'trav_closed_ts', 'drop_ts') # MPHF build: drop_ts → mphf_done_ts row['mphf_s'] = tsdiff(p, 'drop_ts', 'mphf_done_ts') # After MPHF: mphf_done → done_ts row['post_s'] = tsdiff(p, 'mphf_done_ts', 'done_ts') # Graph build: total - known phases (rough estimate) known = sum(v for v in [row['cdeg_s'], row['trav_s'], row['close_s'], row['drop_s'], row['mphf_s'], row['mphf_open_s'], row['bld_ready_s'], row['pass2_s'], row['bld_closed_s']] if v is not None) row['graph_build_s'] = (row['total_s'] - known) if row['total_s'] else None phases[pid] = row # helpers def collect(key): return [r[key] for r in phases.values() if r.get(key) is not None] def rate_stats(n_key, t_key): """Returns list of throughput values (items/s).""" result = [] for r in phases.values(): n, t = r.get(n_key), r.get(t_key) if n and t and t > 0: result.append(n / t) return result # ── output ──────────────────────────────────────────────────────────────────── out = [] w = out.append w("# obikmer merge — performance report\n") # Run info n_parts = len([r for r in phases.values() if r['n_kmers'] > 0]) n_empty = len([r for r in phases.values() if r['n_kmers'] == 0]) total_kmers = sum(r['n_kmers'] for r in phases.values()) w("## Run summary\n") w(f"- **Partitions**: {len(phases)} total — {n_parts} non-empty, {n_empty} empty") w(f"- **New kmers (total)**: {total_kmers:,}") if wall_total: w(f"- **merge_partitions wall time**: {fmt_s(wall_total)}") if workers_final[0]: w(f"- **Workers spawned**: {workers_final[0]} / {workers_final[1]} (max)") w("") # Worker spawn timeline if workers_ev: w("## Worker activation\n") w("| Time | Worker # | Trigger | Efficiency | Gain vs prev |") w("|------|----------|---------|------------|--------------|") t0 = workers_ev[0]['ts'] for e in workers_ev: elapsed = fmt_s((e['ts'] - t0).total_seconds()) trigger = "poll (timeout)" if e['poll'] else "partition done" gain = f"{e['gain']}%" if e.get('gain') is not None else "—" w(f"| +{elapsed} | {e['n']} | {trigger} | {e['eff']}% | {gain} |") w("") # Phase breakdown table w("## Phase timing statistics\n") w("Columns: min | median | mean | max | stdev\n") w("| Phase | min | median | mean | max | stdev |") w("|-------|-----|--------|------|-----|-------|") w(stats_row("Graph build (estimated)", collect('graph_build_s'))) w(stats_row("compute_degrees", collect('cdeg_s'))) w(stats_row("Unitig traversal", collect('trav_s'))) w(stats_row("Writer close (uw.close)", collect('close_s'))) w(stats_row("Graph drop", collect('drop_s'))) w(stats_row("MPHF build", collect('mphf_s'))) w(stats_row("MPHF open", collect('mphf_open_s'))) w(stats_row("Builders ready", collect('bld_ready_s'))) w(stats_row("Pass2 pipeline", collect('pass2_s'))) w(stats_row("Builders close", collect('bld_closed_s'))) w(stats_row("Post-MPHF (residual)", collect('post_s'))) w(stats_row("**Total per partition**", collect('total_s'))) w("") # Throughput w("## Throughput by phase\n") w("| Phase | metric | min | median | mean | max |") w("|-------|--------|-----|--------|------|-----|") def rate_row(label, rates): if not rates: return f"| {label} | — | — | — | — | — |" f = lambda x: fmt_rate(x, 1) mn, med, av, mx = min(rates), median(rates), mean(rates), max(rates) return f"| {label} | nodes/s | {f(mn)} | {f(med)} | {f(av)} | {f(mx)} |" w(rate_row("compute_degrees", rate_stats('n_nodes', 'cdeg_s'))) w(rate_row("Unitig traversal", rate_stats('n_nodes', 'trav_s'))) w(rate_row("MPHF build", rate_stats('n_unitigs', 'mphf_s'))) w("") # Top 10 slowest partitions w("## Top 10 slowest partitions\n") w("| Partition | nodes | unitigs | total | trav | MPHF | graph build |") w("|-----------|-------|---------|-------|------|------|-------------|") sorted_parts = sorted(phases.values(), key=lambda r: r['total_s'] or 0, reverse=True) for r in sorted_parts[:10]: pid = r['pid'] def f(k): return fmt_s(r[k]) if r.get(k) is not None else "—" nodes = f"{r['n_nodes']/1e6:.1f}M" if r['n_nodes'] else "—" unitigs = f"{r['n_unitigs']/1e6:.1f}M" if r['n_unitigs'] else "—" w(f"| {pid} | {nodes} | {unitigs} | {f('total_s')} | {f('trav_s')} | {f('mphf_s')} | {f('graph_build_s')} |") w("") # Phase share of total time (for non-empty partitions with full data) complete = [r for r in phases.values() if all(r.get(k) is not None for k in ('total_s','trav_s','close_s','drop_s','mphf_s', 'mphf_open_s','bld_ready_s','pass2_s','bld_closed_s')) and r['total_s'] and r['total_s'] > 0] if complete: w("## Phase share of total time (mean across complete partitions)\n") total_mean = mean(r['total_s'] for r in complete) w(f"_Based on {len(complete)} partitions with full timing data. Mean total: {fmt_s(total_mean)}_\n") w("| Phase | mean time | share |") w("|-------|-----------|-------|") for label, key in [ ("Graph build", 'graph_build_s'), ("compute_degrees", 'cdeg_s'), ("Unitig traversal", 'trav_s'), ("Writer close", 'close_s'), ("Graph drop", 'drop_s'), ("MPHF build", 'mphf_s'), ("MPHF open", 'mphf_open_s'), ("Builders ready", 'bld_ready_s'), ("Pass2 pipeline", 'pass2_s'), ("Builders close", 'bld_closed_s'), ("Post-MPHF (residual)", 'post_s'), ]: vals = [r[key] for r in complete] m = mean(vals) w(f"| {label} | {fmt_s(m)} | {pct(m, total_mean)} |") w("") print('\n'.join(out))