Files
obikmer/benchmark/aggregate_stats.sh
T
Eric Coissac c694e1f2b0 feat: add benchmark pipeline, expose APIs, and enforce strict paths
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
2026-06-22 10:18:33 +02:00

54 lines
2.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
# TYPE = indexing_presence | indexing_count | verify_presence | verify_count
#      | specific_kmer_presence | specific_kmer_count
#
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header).
# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
# the most recent run CSV (idempotent when nothing changed).
set -euo pipefail

# Require the stats type explicitly: under `set -u` a bare "$1" would abort
# with an opaque "unbound variable" message when the argument is missing.
# Extra arguments are still ignored, as before.
if (( $# < 1 )); then
  {
    printf 'Usage: %s TYPE\n' "${0##*/}"
    printf '  TYPE = indexing_presence | indexing_count\n'
    printf '       | verify_presence | verify_count\n'
    printf '       | specific_kmer_presence | specific_kmer_count\n'
  } >&2
  exit 1
fi
TYPE="$1"

# Absolute path of the directory holding this script; the per-type stats
# directory is resolved relative to it, independent of the caller's cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"
# Map a stats type to the header row of its aggregated CSV.
#   $1      stats type name
#   stdout  comma-separated column names ("run" is always the first column)
#   status  0 for a known type, 1 otherwise (nothing is printed)
stats_header() {
  case "$1" in
    indexing_presence | indexing_count)
      printf '%s\n' "run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
      ;;
    verify_presence)
      printf '%s\n' "run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
      ;;
    verify_count)
      printf '%s\n' "run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
      ;;
    specific_kmer_presence | specific_kmer_count)
      printf '%s\n' "run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
      ;;
    *)
      return 1
      ;;
  esac
}

# Resolve the header for the requested type; an unrecognised type is fatal.
if ! HEADER="$(stats_header "${TYPE}")"; then
  echo "ERROR: unknown stats type '${TYPE}'" >&2
  exit 1
fi
# A missing stats directory makes `find` fail; because its stderr is discarded
# below, `pipefail` + `errexit` would then terminate the script without any
# message. Report the problem explicitly instead (exit status stays non-zero).
if [[ ! -d "${STATS_DIR}" ]]; then
  echo "ERROR: stats directory not found: ${STATS_DIR}" >&2
  exit 1
fi

# Most recent existing run CSV (empty string if none). "Most recent" is the
# lexicographically greatest run_*.csv name in the stats directory.
latest_csv=$(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv' 2>/dev/null | sort | tail -n 1)

# Nothing to do when a run CSV exists and no .stats file has been modified
# after it: this is what makes repeated invocations idempotent.
if [[ -n "${latest_csv}" ]] \
  && [[ -z "$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}" 2>/dev/null)" ]]; then
  echo "[${TYPE}] stats up to date (${latest_csv})"
  exit 0
fi
# Next run index = highest existing run_NNN.csv index + 1. Counting the files
# instead would reuse (and overwrite) an existing index as soon as any earlier
# run CSV has been removed. With a contiguous series starting at run_000 the
# result is the same as the file count.
last_idx=-1
while IFS= read -r csv_path; do
  idx="${csv_path##*/run_}"
  idx="${idx%.csv}"
  # Ignore names whose suffix is not purely numeric (e.g. run_old.csv).
  [[ "${idx}" =~ ^[0-9]+$ ]] || continue
  idx=$((10#${idx})) # force base 10: zero-padded values must not parse as octal
  if (( idx > last_idx )); then
    last_idx=${idx}
  fi
done < <(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv')
printf -v run_n '%03d' "$((last_idx + 1))"

CSV="${STATS_DIR}/run_${run_n}.csv"

# Build the CSV under a name that does not match 'run_*.csv' and rename it
# only once complete, so an interrupted run cannot leave a truncated file that
# a later invocation would mistake for an up-to-date result.
partial_csv="${CSV}.partial"
trap 'rm -f -- "${partial_csv}"' EXIT
{
  printf '%s\n' "${HEADER}"
  # Sort .stats files by name (byte-wise, locale-independent) for a
  # reproducible row order.
  while IFS= read -r stats_file; do
    # Prefix every row with the run number. awk's print always terminates the
    # record, so a .stats file lacking a final newline cannot fuse its row
    # with the first row of the next file.
    awk -v run="${run_n}" '{ print run "," $0 }' <"${stats_file}"
  done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | LC_ALL=C sort)
} >"${partial_csv}"
mv -f -- "${partial_csv}" "${CSV}"
trap - EXIT

echo "[${TYPE}] run ${run_n} -> ${CSV}"