Files
obikmer/scripts/merge_report.py
T
Eric Coissac 6d85387077 feat: add performance instrumentation and dynamic worker scaling
This change enhances observability and adaptability in the merge pipeline. Performance timing and debug logging are added to the De Bruijn graph and partition merge layers to track phase durations and pipeline metrics. The merge module replaces blocking receives with timed polls to sample CPU efficiency, dynamically spawning workers when utilization drops below a threshold. A new script is also introduced to parse merge debug logs and generate structured Markdown reports detailing throughput, phase breakdowns, and partition performance.
2026-06-13 13:21:53 +02:00

348 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
"""Parse obikmer merge debug log → Markdown performance report."""
import re
import sys
from datetime import datetime
from collections import defaultdict
from statistics import mean, median, stdev
ANSI = re.compile(r'\x1b\[[0-9;]*m')
def strip(s):
return ANSI.sub('', s)
def parse_ts(s):
return datetime.fromisoformat(s.replace('Z', '+00:00'))
def dur_s(s):
s = s.strip()
if s.endswith('ms'): return float(s[:-2]) / 1e3
if s.endswith('µs'): return float(s[:-2]) / 1e6
if s.endswith('us'): return float(s[:-2]) / 1e6
if s.endswith('ns'): return float(s[:-2]) / 1e9
if s.endswith('s'): return float(s[:-1])
return float(s)
def fmt_s(s):
if s < 0.001: return f"{s*1e6:.0f}µs"
if s < 1: return f"{s*1e3:.0f}ms"
if s < 60: return f"{s:.2f}s"
return f"{s/60:.1f}min ({s:.0f}s)"
def fmt_rate(n, s):
if s <= 0: return "—"
r = n / s
if r >= 1e9: return f"{r/1e9:.2f}G/s"
if r >= 1e6: return f"{r/1e6:.2f}M/s"
if r >= 1e3: return f"{r/1e3:.2f}K/s"
return f"{r:.0f}/s"
def pct(a, b):
return f"{100*a/b:.1f}%" if b else "—"
def stats_row(label, values, unit="s", fmt=fmt_s):
if not values: return f"| {label} | — | — | — | — | — |"
mn, mx, med, av = min(values), max(values), median(values), mean(values)
sd = stdev(values) if len(values) > 1 else 0
return f"| {label} | {fmt(mn)} | {fmt(med)} | {fmt(av)} | {fmt(mx)} | {fmt(sd)} |"
# ── patterns ──────────────────────────────────────────────────────────────────
TS = r'(\d{4}-\d{2}-\d{2}T[\d:.]+Z)'
pats = {
'graph_done': re.compile(TS + r'.*partition (\d+): de Bruijn graph done — (\d+) new kmers'),
'trav_start': re.compile(TS + r'.*partition (\d+): unitig traversal start — (\d+) nodes'),
'trav_closing': re.compile(TS + r'.*partition (\d+): unitig writer closing'),
'trav_closed': re.compile(TS + r'.*partition (\d+): unitig writer closed'),
'graph_dropped': re.compile(TS + r'.*partition (\d+): graph dropped — starting MPHF build \((\d+) unitigs\)'),
'mphf_done': re.compile(TS + r'.*partition (\d+): MPHF build done'),
'mphf_open': re.compile(TS + r'.*partition (\d+): MPHF open in ([\d.]+)s'),
'bld_ready': re.compile(TS + r'.*partition (\d+): builders ready in ([\d.]+)s'),
'pass2_done': re.compile(TS + r'.*partition (\d+): pass2 pipeline done in ([\d.]+)s'),
'bld_closed': re.compile(TS + r'.*partition (\d+): builders closed in ([\d.]+)s'),
'part_done': re.compile(TS + r'.*partition (\d+): done in ([\d.]+)s — (\d+) new kmers'),
'worker': re.compile(TS + r'.*activated worker (\d+).*efficiency (\d+)%.*gain vs prev (\d+)%'),
'worker_poll': re.compile(TS + r'.*activated worker (\d+) \(poll\).*efficiency (\d+)%'),
'compute_deg': re.compile(TS + r'.*partition (\d+): compute_degrees in ([\d.]+)s — (\d+) nodes'),
'stage_done': re.compile(TS + r'.*done stage=merge_partitions wall_secs=([\d.]+)'),
'workers_rep': re.compile(r'workers spawned: (\d+) / (\d+)'),
}
# ── parse ─────────────────────────────────────────────────────────────────────
P = defaultdict(dict) # partition_id → timing dict
workers_ev = []
wall_total = None
workers_final = (None, None)
with open(sys.argv[1]) as f:
for raw in f:
line = strip(raw)
m = pats['graph_done'].search(line)
if m:
pid = int(m.group(2))
P[pid]['n_kmers'] = int(m.group(3))
P[pid]['graph_done_ts'] = parse_ts(m.group(1))
continue
m = pats['trav_start'].search(line)
if m:
pid = int(m.group(2))
P[pid]['trav_start_ts'] = parse_ts(m.group(1))
P[pid]['n_nodes'] = int(m.group(3))
continue
m = pats['trav_closing'].search(line)
if m:
pid = int(m.group(2))
P[pid]['trav_closing_ts'] = parse_ts(m.group(1))
continue
m = pats['trav_closed'].search(line)
if m:
pid = int(m.group(2))
P[pid]['trav_closed_ts'] = parse_ts(m.group(1))
continue
m = pats['graph_dropped'].search(line)
if m:
pid = int(m.group(2))
P[pid]['drop_ts'] = parse_ts(m.group(1))
P[pid]['n_unitigs'] = int(m.group(3))
continue
m = pats['mphf_done'].search(line)
if m:
pid = int(m.group(2))
P[pid]['mphf_done_ts'] = parse_ts(m.group(1))
continue
m = pats['mphf_open'].search(line)
if m:
pid = int(m.group(2))
P[pid]['mphf_open_s'] = float(m.group(3))
continue
m = pats['bld_ready'].search(line)
if m:
pid = int(m.group(2))
P[pid]['bld_ready_s'] = float(m.group(3))
continue
m = pats['pass2_done'].search(line)
if m:
pid = int(m.group(2))
P[pid]['pass2_s'] = float(m.group(3))
continue
m = pats['bld_closed'].search(line)
if m:
pid = int(m.group(2))
P[pid]['bld_closed_s'] = float(m.group(3))
continue
m = pats['part_done'].search(line)
if m:
pid = int(m.group(2))
P[pid]['total_s'] = float(m.group(3))
P[pid]['done_ts'] = parse_ts(m.group(1))
continue
m = pats['worker'].search(line)
if m:
workers_ev.append({'n': int(m.group(2)), 'eff': int(m.group(3)),
'gain': int(m.group(4)), 'ts': parse_ts(m.group(1)), 'poll': False})
continue
m = pats['worker_poll'].search(line)
if m:
workers_ev.append({'n': int(m.group(2)), 'eff': int(m.group(3)),
'gain': None, 'ts': parse_ts(m.group(1)), 'poll': True})
continue
m = pats['compute_deg'].search(line)
if m:
pid = int(m.group(2))
P[pid]['cdeg_s'] = float(m.group(3))
P[pid]['n_nodes'] = P[pid].get('n_nodes') or int(m.group(4))
continue
m = pats['stage_done'].search(line)
if m:
wall_total = float(m.group(2))
continue
m = pats['workers_rep'].search(line)
if m:
workers_final = (int(m.group(1)), int(m.group(2)))
continue
# ── derive per-partition phases ───────────────────────────────────────────────
def tsdiff(p, k1, k2):
if k1 in p and k2 in p:
return (p[k2] - p[k1]).total_seconds()
return None
phases = {}
for pid, p in P.items():
row = {'pid': pid}
row['n_kmers'] = p.get('n_kmers', 0)
row['n_nodes'] = p.get('n_nodes', 0)
row['n_unitigs']= p.get('n_unitigs', 0)
row['total_s'] = p.get('total_s')
row['cdeg_s'] = p.get('cdeg_s')
row['mphf_open_s'] = p.get('mphf_open_s')
row['bld_ready_s'] = p.get('bld_ready_s')
row['pass2_s'] = p.get('pass2_s')
row['bld_closed_s'] = p.get('bld_closed_s')
# Traversal: trav_start → trav_closing (= writing all unitigs)
row['trav_s'] = tsdiff(p, 'trav_start_ts', 'trav_closing_ts')
# Writer close: trav_closing → trav_closed
row['close_s'] = tsdiff(p, 'trav_closing_ts', 'trav_closed_ts')
# Graph drop: trav_closed → drop_ts
row['drop_s'] = tsdiff(p, 'trav_closed_ts', 'drop_ts')
# MPHF build: drop_ts → mphf_done_ts
row['mphf_s'] = tsdiff(p, 'drop_ts', 'mphf_done_ts')
# After MPHF: mphf_done → done_ts
row['post_s'] = tsdiff(p, 'mphf_done_ts', 'done_ts')
# Graph build: total - known phases (rough estimate)
known = sum(v for v in [row['cdeg_s'], row['trav_s'], row['close_s'], row['drop_s'],
row['mphf_s'], row['mphf_open_s'], row['bld_ready_s'],
row['pass2_s'], row['bld_closed_s']] if v is not None)
row['graph_build_s'] = (row['total_s'] - known) if row['total_s'] else None
phases[pid] = row
# helpers
def collect(key):
return [r[key] for r in phases.values() if r.get(key) is not None]
def rate_stats(n_key, t_key):
"""Returns list of throughput values (items/s)."""
result = []
for r in phases.values():
n, t = r.get(n_key), r.get(t_key)
if n and t and t > 0:
result.append(n / t)
return result
# ── output ────────────────────────────────────────────────────────────────────
out = []
w = out.append
w("# obikmer merge — performance report\n")
# Run info
n_parts = len([r for r in phases.values() if r['n_kmers'] > 0])
n_empty = len([r for r in phases.values() if r['n_kmers'] == 0])
total_kmers = sum(r['n_kmers'] for r in phases.values())
w("## Run summary\n")
w(f"- **Partitions**: {len(phases)} total — {n_parts} non-empty, {n_empty} empty")
w(f"- **New kmers (total)**: {total_kmers:,}")
if wall_total:
w(f"- **merge_partitions wall time**: {fmt_s(wall_total)}")
if workers_final[0]:
w(f"- **Workers spawned**: {workers_final[0]} / {workers_final[1]} (max)")
w("")
# Worker spawn timeline
if workers_ev:
w("## Worker activation\n")
w("| Time | Worker # | Trigger | Efficiency | Gain vs prev |")
w("|------|----------|---------|------------|--------------|")
t0 = workers_ev[0]['ts']
for e in workers_ev:
elapsed = fmt_s((e['ts'] - t0).total_seconds())
trigger = "poll (timeout)" if e['poll'] else "partition done"
gain = f"{e['gain']}%" if e.get('gain') is not None else "—"
w(f"| +{elapsed} | {e['n']} | {trigger} | {e['eff']}% | {gain} |")
w("")
# Phase breakdown table
w("## Phase timing statistics\n")
w("Columns: min | median | mean | max | stdev\n")
w("| Phase | min | median | mean | max | stdev |")
w("|-------|-----|--------|------|-----|-------|")
w(stats_row("Graph build (estimated)", collect('graph_build_s')))
w(stats_row("compute_degrees", collect('cdeg_s')))
w(stats_row("Unitig traversal", collect('trav_s')))
w(stats_row("Writer close (uw.close)", collect('close_s')))
w(stats_row("Graph drop", collect('drop_s')))
w(stats_row("MPHF build", collect('mphf_s')))
w(stats_row("MPHF open", collect('mphf_open_s')))
w(stats_row("Builders ready", collect('bld_ready_s')))
w(stats_row("Pass2 pipeline", collect('pass2_s')))
w(stats_row("Builders close", collect('bld_closed_s')))
w(stats_row("Post-MPHF (residual)", collect('post_s')))
w(stats_row("**Total per partition**", collect('total_s')))
w("")
# Throughput
w("## Throughput by phase\n")
w("| Phase | metric | min | median | mean | max |")
w("|-------|--------|-----|--------|------|-----|")
def rate_row(label, rates):
if not rates: return f"| {label} | — | — | — | — | — |"
f = lambda x: fmt_rate(x, 1)
mn, med, av, mx = min(rates), median(rates), mean(rates), max(rates)
return f"| {label} | nodes/s | {f(mn)} | {f(med)} | {f(av)} | {f(mx)} |"
w(rate_row("compute_degrees", rate_stats('n_nodes', 'cdeg_s')))
w(rate_row("Unitig traversal", rate_stats('n_nodes', 'trav_s')))
w(rate_row("MPHF build", rate_stats('n_unitigs', 'mphf_s')))
w("")
# Top 10 slowest partitions
w("## Top 10 slowest partitions\n")
w("| Partition | nodes | unitigs | total | trav | MPHF | graph build |")
w("|-----------|-------|---------|-------|------|------|-------------|")
sorted_parts = sorted(phases.values(), key=lambda r: r['total_s'] or 0, reverse=True)
for r in sorted_parts[:10]:
pid = r['pid']
def f(k): return fmt_s(r[k]) if r.get(k) is not None else "—"
nodes = f"{r['n_nodes']/1e6:.1f}M" if r['n_nodes'] else "—"
unitigs = f"{r['n_unitigs']/1e6:.1f}M" if r['n_unitigs'] else "—"
w(f"| {pid} | {nodes} | {unitigs} | {f('total_s')} | {f('trav_s')} | {f('mphf_s')} | {f('graph_build_s')} |")
w("")
# Phase share of total time (for non-empty partitions with full data)
complete = [r for r in phases.values()
if all(r.get(k) is not None
for k in ('total_s','trav_s','close_s','drop_s','mphf_s',
'mphf_open_s','bld_ready_s','pass2_s','bld_closed_s'))
and r['total_s'] and r['total_s'] > 0]
if complete:
w("## Phase share of total time (mean across complete partitions)\n")
total_mean = mean(r['total_s'] for r in complete)
w(f"_Based on {len(complete)} partitions with full timing data. Mean total: {fmt_s(total_mean)}_\n")
w("| Phase | mean time | share |")
w("|-------|-----------|-------|")
for label, key in [
("Graph build", 'graph_build_s'),
("compute_degrees", 'cdeg_s'),
("Unitig traversal", 'trav_s'),
("Writer close", 'close_s'),
("Graph drop", 'drop_s'),
("MPHF build", 'mphf_s'),
("MPHF open", 'mphf_open_s'),
("Builders ready", 'bld_ready_s'),
("Pass2 pipeline", 'pass2_s'),
("Builders close", 'bld_closed_s'),
("Post-MPHF (residual)", 'post_s'),
]:
vals = [r[key] for r in complete]
m = mean(vals)
w(f"| {label} | {fmt_s(m)} | {pct(m, total_mean)} |")
w("")
print('\n'.join(out))