feat: add benchmark pipeline, expose APIs, and enforce strict paths
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
This commit is contained in:
Executable
+53
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
#   TYPE = indexing_presence | indexing_count
#        | verify_presence | verify_count
#        | specific_kmer_presence | specific_kmer_count
#
# Reads all stats/TYPE/*.stats files (one CSV data row each, no header),
# located next to this script.
# Creates a new stats/TYPE/run_NNN.csv only if any .stats file is newer than
# the most recent run CSV (idempotent when nothing changed). Every data row
# is prefixed with the run number NNN and the file starts with a header line
# matching TYPE.
#
# Exit status: 0 on success or when already up to date; 1 on a usage error,
# an unknown TYPE, or a missing stats/TYPE directory.
set -euo pipefail

# Fail with a readable message instead of bash's bare "unbound variable".
if (( $# < 1 )); then
  echo "Usage: ${0##*/} TYPE" >&2
  exit 1
fi

TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"

# Column header of the aggregated CSV; "run" is the column this script adds.
case "${TYPE}" in
  indexing_presence|indexing_count)
    HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
    ;;
  verify_presence)
    HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
    ;;
  verify_count)
    HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
    ;;
  specific_kmer_presence|specific_kmer_count)
    HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
    ;;
  *)
    echo "ERROR: unknown stats type '${TYPE}'" >&2
    exit 1
    ;;
esac

# Without this check a missing directory made the script exit 1 silently
# (find failed with stderr discarded, pipefail + errexit did the rest).
if [[ ! -d "${STATS_DIR}" ]]; then
  echo "ERROR: stats directory '${STATS_DIR}' does not exist" >&2
  exit 1
fi

# Find the existing run CSV with the highest run number (empty if none).
# Comparing numerically rather than by name keeps run_1000 after run_999,
# and numbering from the maximum rather than the file count means a deleted
# CSV can never cause an existing run to be overwritten.
latest_csv=""
latest_n=-1
shopt -s nullglob
for candidate in "${STATS_DIR}"/run_*.csv; do
  num="${candidate##*/}"
  num="${num#run_}"
  num="${num%.csv}"
  [[ "${num}" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10: zero-padded numbers would otherwise be read as octal.
  if (( 10#${num} > latest_n )); then
    latest_n=$(( 10#${num} ))
    latest_csv="${candidate}"
  fi
done
shopt -u nullglob

# Nothing to do unless some .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]]; then
  newer_stats="$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}")"
  if [[ -z "${newer_stats}" ]]; then
    echo "[${TYPE}] stats up to date (${latest_csv})"
    exit 0
  fi
fi

printf -v run_n '%03d' "$(( latest_n + 1 ))"
CSV="${STATS_DIR}/run_${run_n}.csv"

# Build the CSV under a temporary name and rename it into place at the end:
# a partially written run_NNN.csv would be newer than every .stats file and
# make the next invocation wrongly report "up to date". The leading dot and
# the suffix keep the temporary file out of the run_*.csv and *.stats globs.
tmp_csv="${STATS_DIR}/.run_${run_n}.csv.tmp.$$"
trap 'rm -f -- "${tmp_csv}"' EXIT

{
  echo "${HEADER}"
  # Sort .stats files by name for reproducible row order; LC_ALL=C makes the
  # order independent of the caller's locale.
  while IFS= read -r stats_file; do
    # awk always newline-terminates each record, so a .stats file lacking a
    # trailing newline cannot fuse its row with the next file's row.
    awk -v run="${run_n}" '{ print run "," $0 }' "${stats_file}"
  done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | LC_ALL=C sort)
} >"${tmp_csv}"

mv -f -- "${tmp_csv}" "${CSV}"
trap - EXIT

echo "[${TYPE}] run ${run_n} → ${CSV}"
|
||||
Reference in New Issue
Block a user