#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
#   TYPE = indexing_presence | indexing_count
#        | verify_presence | verify_count
#        | specific_kmer_presence | specific_kmer_count
#
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header),
# located next to this script, and prefixes every row with the run number.
# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
# the most recent run CSV (idempotent when nothing changed).
#
# Exit status: 0 when a CSV was written or everything is up to date,
#              1 on a missing/unknown TYPE or a missing stats directory.

set -euo pipefail

# Fail with a usage message instead of bash's bare "unbound variable" error.
if (( $# < 1 )); then
  printf 'Usage: %s TYPE\n' "${0##*/}" >&2
  exit 1
fi

TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"

# Column header for each stats type; the leading "run" column is the run
# number that gets prepended to every data row below.
case "${TYPE}" in
  indexing_presence | indexing_count)
    HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
    ;;
  verify_presence)
    HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
    ;;
  verify_count)
    HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
    ;;
  specific_kmer_presence | specific_kmer_count)
    HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
    ;;
  *)
    echo "ERROR: unknown stats type '${TYPE}'" >&2
    exit 1
    ;;
esac

# Without this check a missing directory would abort the script silently.
if [[ ! -d "${STATS_DIR}" ]]; then
  printf "ERROR: stats directory '%s' does not exist\n" "${STATS_DIR}" >&2
  exit 1
fi

# Find the most recent existing run CSV (empty string if none) and the next
# free run number. Numbers are compared numerically, and the next number is
# "highest existing + 1" rather than the file count, so a deleted run file
# can never cause an existing CSV to be overwritten.
shopt -s nullglob
latest_csv=""
next_run=0
for csv in "${STATS_DIR}"/run_*.csv; do
  [[ "${csv##*/}" =~ ^run_([0-9]+)\.csv$ ]] || continue
  n=$((10#${BASH_REMATCH[1]})) # force base 10: "008" is not valid octal
  if (( n >= next_run )); then
    next_run=$((n + 1))
    latest_csv="${csv}"
  fi
done
shopt -u nullglob

# Nothing to do unless some .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]]; then
  newer_stats="$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' \
    -newer "${latest_csv}" 2>/dev/null || true)"
  if [[ -z "${newer_stats}" ]]; then
    echo "[${TYPE}] stats up to date (${latest_csv})"
    exit 0
  fi
fi

printf -v run_n '%03d' "${next_run}"
CSV="${STATS_DIR}/run_${run_n}.csv"

# Build the CSV under a temporary name and rename it at the end: a partially
# written run CSV would be newer than every .stats file and would make the
# next invocation wrongly report "up to date". The name matches neither
# 'run_*.csv' nor '*.stats', so it is never picked up by the scans above.
tmp_csv="${CSV}.tmp"
trap 'rm -f -- "${tmp_csv}"' EXIT

{
  printf '%s\n' "${HEADER}"
  # Sort .stats files by name (byte-wise, locale-independent) for
  # reproducible row order.
  while IFS= read -r stats_file; do
    # Emit each row with an explicit newline so that a .stats file lacking a
    # trailing newline cannot be fused with the next file's row.
    while IFS= read -r row || [[ -n "${row}" ]]; do
      printf '%s,%s\n' "${run_n}" "${row}"
    done <"${stats_file}"
  done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | LC_ALL=C sort)
} >"${tmp_csv}"

mv -f -- "${tmp_csv}" "${CSV}"

echo "[${TYPE}] run ${run_n} → ${CSV}"