c694e1f2b0
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
54 lines
2.1 KiB
Bash
Executable File
54 lines
2.1 KiB
Bash
Executable File
#!/usr/bin/env bash
# Usage: aggregate_stats.sh TYPE
#   TYPE = indexing_presence | indexing_count
#        | verify_presence | verify_count
#        | specific_kmer_presence | specific_kmer_count
#
# Reads all stats/TYPE/*.stats files (CSV data rows, no header) found next to
# this script and aggregates them into stats/TYPE/run_NNN.csv: a TYPE-specific
# header line followed by every data row prefixed with the run number NNN.
#
# A new stats/TYPE/run_NNN.csv is created only if any .stats file is newer
# than the most recent run CSV (idempotent when nothing changed).
#
# Exit status:
#   0  a new run CSV was written, or the stats were already up to date
#   1  unknown TYPE, or the stats/TYPE directory does not exist
#   2  TYPE argument missing
set -euo pipefail

# Fail with a usage line instead of the bare "unbound variable" that set -u
# would otherwise produce for a missing argument.
if (( $# < 1 )); then
  printf 'Usage: %s TYPE\n' "${0##*/}" >&2
  exit 2
fi

TYPE="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
STATS_DIR="${SCRIPT_DIR}/stats/${TYPE}"

# Select the CSV header matching the column layout of this TYPE's .stats rows.
# The leading "run" column is the one this script prepends to every row.
case "${TYPE}" in
  indexing_presence | indexing_count)
    HEADER="run,species,strain,scatter_wall_s,scatter_rss_b,dereplicate_wall_s,dereplicate_rss_b,count_kmer_wall_s,count_kmer_rss_b,index_wall_s,index_rss_b,total_wall_s,total_rss_b"
    ;;
  verify_presence)
    HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct"
    ;;
  verify_count)
    HEADER="run,species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,fn_pct,fp_pct,cm_pct"
    ;;
  specific_kmer_presence | specific_kmer_count)
    HEADER="run,species,rebuild_wall_s,rebuild_rss_b,pack_wall_s,pack_rss_b,filter_total_wall_s,filter_total_rss_b,select_wall_s,select_rss_b,select_total_wall_s,select_total_rss_b"
    ;;
  *)
    echo "ERROR: unknown stats type '${TYPE}'" >&2
    exit 1
    ;;
esac

# Without this check a missing directory makes find fail, which (under
# pipefail / errexit) would abort the script with no diagnostic at all.
if [[ ! -d "${STATS_DIR}" ]]; then
  echo "ERROR: stats directory '${STATS_DIR}' does not exist" >&2
  exit 1
fi

# Find the most recent existing run CSV (empty string if none) and the next
# free run number. Runs are compared numerically, and the next number is
# "highest + 1" rather than "file count", so a gap left by a deleted run can
# never cause an existing CSV to be overwritten.
latest_csv=""
next_run=0
while IFS= read -r csv_path; do
  run_id="${csv_path##*/run_}"
  run_id="${run_id%.csv}"
  # Ignore files such as run_foo.csv that carry no run number.
  [[ "${run_id}" =~ ^[0-9]+$ ]] || continue
  # 10# forces base 10: zero-padded ids like 008 are otherwise invalid octal.
  if (( 10#${run_id} >= next_run )); then
    next_run=$(( 10#${run_id} + 1 ))
    latest_csv="${csv_path}"
  fi
done < <(find "${STATS_DIR}" -maxdepth 1 -name 'run_*.csv')

# Nothing to do unless some .stats file is newer than the latest run CSV.
if [[ -n "${latest_csv}" ]]; then
  newer_stats="$(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' -newer "${latest_csv}")"
  if [[ -z "${newer_stats}" ]]; then
    echo "[${TYPE}] stats up to date (${latest_csv})"
    exit 0
  fi
fi

run_n="$(printf '%03d' "${next_run}")"
CSV="${STATS_DIR}/run_${run_n}.csv"

# Build the CSV in a temporary file and rename it into place at the end, so a
# failure midway never leaves a truncated run CSV that later invocations would
# mistake for an up-to-date result. The temp name matches neither 'run_*.csv'
# nor '*.stats', so it is invisible to the find calls above.
tmp_csv="${CSV}.tmp.$$"
trap 'rm -f -- "${tmp_csv}"' EXIT

{
  echo "${HEADER}"
  # Sort .stats files by name for reproducible row order.
  while IFS= read -r stats_file; do
    # awk terminates every record with a newline, so a .stats file lacking a
    # final newline cannot fuse its last row with the next file's first row.
    awk -v run="${run_n}" '{ print run "," $0 }' <"${stats_file}"
  done < <(find "${STATS_DIR}" -maxdepth 1 -name '*.stats' | sort)
} >"${tmp_csv}"

mv -- "${tmp_csv}" "${CSV}"

echo "[${TYPE}] run ${run_n} → ${CSV}"