#!/usr/bin/env bash
#
# Runs `obikmer merge` over the index directories found under
# specimen_index_count/ and appends the per-stage wall-time / RSS figures
# parsed from the tool's stderr to a fresh CSV under stats/merge_count/.
set -euo pipefail

# All paths are resolved relative to the directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
IDX_DIR="${SCRIPT_DIR}/specimen_index_count"
OUTPUT="${SCRIPT_DIR}/global_index_count"
STATS_DIR="${SCRIPT_DIR}/stats/merge_count"

#######################################
# Print the first unused zero-padded run id for a stats directory.
# The search starts at the number of existing run_*.csv files, so ids stay
# sequential in the normal case, and moves forward past any id whose CSV
# already exists. Without that skip, a gap in the numbering (e.g. a deleted
# run_000.csv) would select an id that is already taken and the header
# write below would truncate that earlier run.
# Arguments: $1 - directory holding run_NNN.csv files
# Outputs:   the three-digit run id on stdout
#######################################
next_run_id() {
  local dir="$1" n id
  n=$(find "${dir}" -maxdepth 1 -name 'run_*.csv' | wc -l | tr -d ' ')
  printf -v id '%03d' "${n}"
  while [[ -e "${dir}/run_${id}.csv" ]]; do
    n=$((n + 1))
    printf -v id '%03d' "${n}"
  done
  printf '%s\n' "${id}"
}

mkdir -p "${STATS_DIR}"

run_n="$(next_run_id "${STATS_DIR}")"
CSV="${STATS_DIR}/run_${run_n}.csv"

# Start the CSV with its header; the data row is appended later in the script.
printf 'run,n_sources,bootstrap_wall_s,bootstrap_rss_b,spectrums_wall_s,spectrums_rss_b,merge_partitions_wall_s,merge_partitions_rss_b,pack_wall_s,pack_rss_b,total_wall_s,total_rss_b\n' >"${CSV}"
#######################################
# Turn the stage table found in a captured log into one CSV data row.
# The log is scanned for a header line containing the words "stage" and
# "wall", then a separator line, then one row per stage, then a second
# separator followed by a totals row.
# Arguments: $1 - run id (copied verbatim into the first CSV field)
#            $2 - number of merged sources (copied into the second field)
#            $3 - path of the log file to parse
# Outputs:   a single CSV line on stdout: run, n_sources, then wall seconds
#            and RSS bytes for bootstrap, spectrums, merge_partitions, pack
#            and TOTAL. Stages absent from the log yield empty fields.
#######################################
parse_reporter() {
  local run="$1" n_sources="$2" logfile="$3"
  python3 - "$run" "$n_sources" "$logfile" <<'PYEOF'
import sys, re

run, n_sources, logfile = sys.argv[1], sys.argv[2], sys.argv[3]

# (suffix, divisor to seconds). Longer suffixes come first so that "ms"/"us"
# are not mistaken for a bare "s". The micro sign and Greek mu are written as
# escapes to keep this source ASCII-only.
WALL_UNITS = (
    ('ns', 1e9),
    ('us', 1e6),
    ('\u00b5s', 1e6),
    ('\u03bcs', 1e6),
    ('ms', 1000.0),
    ('s', 1.0),
)


def strip_ansi(s):
    """Remove ANSI CSI escape sequences (colours, cursor moves) from s."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)


def parse_wall(s):
    """Convert a duration such as '1.5s', '250ms' or '80us' to seconds.

    Anything unrecognised, or with a non-numeric magnitude, yields 0.0
    instead of raising, so one odd cell cannot abort the whole run.
    """
    s = s.strip()
    for suffix, divisor in WALL_UNITS:
        if s.endswith(suffix):
            try:
                return float(s[:-len(suffix)]) / divisor
            except ValueError:
                return 0.0
    return 0.0


def parse_rss(s):
    """Convert a size such as '1.5GB', '512 KB' or '2GiB' to bytes.

    Units are binary multiples (KB = 1024 B). Returns 0 when s does not
    start with a recognisable size.
    """
    m = re.match(r'([\d.]+)\s*([KMGT])?i?B', s.strip())
    if not m:
        return 0
    try:
        magnitude = float(m.group(1))
    except ValueError:
        return 0
    return int(magnitude * (1 << (10 * ' KMGT'.index(m.group(2) or ' '))))


def is_sep(s):
    """True for a non-empty line without any letter or digit (a table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)


stats = {}
state = 'scan'

with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()

        if state == 'scan':
            # Look for the table header.
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            # Skip everything up to the rule under the header.
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                # NOTE(review): parse_rss tolerates a space between number and
                # unit, but splitting on single spaces would separate them and
                # shift the columns -- confirm the reporter's column layout.
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    # Column 0 is the stage name, 1 the wall time, 3 the RSS.
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    stats[parts[0]] = (parse_wall(parts[1]),
                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
                # The first non-blank line after the second rule ends the table.
                break

STAGE_ORDER = ['bootstrap', 'spectrums', 'merge_partitions', 'pack']
row = [run, n_sources]
for stage in STAGE_ORDER:
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF
}
# Every direct sub-directory of IDX_DIR is treated as one index to merge;
# the list is sorted by path.
mapfile -t sources < <(find "${IDX_DIR}" -mindepth 1 -maxdepth 1 -type d | sort)

if [[ ${#sources[@]} -eq 0 ]]; then
  echo "ERROR: no indexes found in ${IDX_DIR}" >&2
  exit 1
fi

echo "Merging ${#sources[@]} count indexes → ${OUTPUT}"
printf ' %s\n' "${sources[@]}"

# The binary's stderr is captured to a temp file so it can be parsed
# afterwards; the file is removed on any exit path.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# Capture the exit status instead of letting `set -e` abort here: on failure
# the captured stderr (including the shell's own "not found" message, which
# also lands in the redirect) must still be shown before the trap deletes it.
merge_rc=0
"${BINARY}" merge \
  --output "${OUTPUT}" \
  --force \
  "${sources[@]}" \
  2>"${STDERR_LOG}" || merge_rc=$?

# Replay the captured stderr so the user sees it whether or not the run worked.
cat "${STDERR_LOG}" >&2

if [[ ${merge_rc} -ne 0 ]]; then
  echo "ERROR: ${BINARY} merge exited with status ${merge_rc}" >&2
  exit "${merge_rc}"
fi

parse_reporter "${run_n}" "${#sources[@]}" "${STDERR_LOG}" >>"${CSV}"

echo "Done. Run ${run_n} → ${CSV}"