feat: add benchmark pipeline, expose APIs, and enforce strict paths

Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
This commit is contained in:
Eric Coissac
2026-06-19 09:55:41 +02:00
parent 280ca1f5a3
commit c694e1f2b0
42 changed files with 2585 additions and 84 deletions
+39
View File
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
REF_DIR="${SCRIPT_DIR}/reference_index"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
BUILD_PY="${SCRIPT_DIR}/build_reference.py"
KMER_SIZE="${KMER_SIZE:-31}"
MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"
mkdir -p "${REF_DIR}"
for species_dir in "${SIMDATA_DIR}"/*/; do
[[ -d "${species_dir}" ]] || continue
species=$(basename "${species_dir}")
for strain_dir in "${species_dir}"*/; do
[[ -d "${strain_dir}" ]] || continue
strain=$(basename "${strain_dir}")
r1="${strain_dir}/reads_R1.fastq.gz"
r2="${strain_dir}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
echo "SKIP ${species}--${strain}: reads not found" >&2
continue
fi
out="${REF_DIR}/${species}--${strain}.npz"
echo "[${species}--${strain}] → ${out}"
"${PYTHON}" "${BUILD_PY}" \
--kmer-size "${KMER_SIZE}" \
--min-abundance "${MIN_ABUNDANCE}" \
--output "${out}" \
"${r1}" "${r2}"
done
done