feat: add benchmark pipeline, expose APIs, and enforce strict paths
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
This commit is contained in:
Executable
+39
@@ -0,0 +1,39 @@
|
||||
#!/usr/bin/env bash
#
# Build one reference index file (.npz) per simulated strain.
#
# Expected layout, relative to the directory containing this script:
#   simulated_data/<species>/<strain>/reads_R1.fastq.gz
#   simulated_data/<species>/<strain>/reads_R2.fastq.gz
#
# For every strain directory that holds both read files, build_reference.py is
# run with the interpreter at ../.venv/bin/python3 and writes
#   reference_index/<species>--<strain>.npz
#
# Environment:
#   KMER_SIZE      value passed as --kmer-size      (default: 31)
#   MIN_ABUNDANCE  value passed as --min-abundance  (default: 1)
#
# Exit status: non-zero as soon as one build_reference.py invocation fails
# (set -e); zero otherwise, including when nothing was found to build.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SIMDATA_DIR="${SCRIPT_DIR}/simulated_data"
REF_DIR="${SCRIPT_DIR}/reference_index"
PYTHON="${SCRIPT_DIR}/../.venv/bin/python3"
BUILD_PY="${SCRIPT_DIR}/build_reference.py"

KMER_SIZE="${KMER_SIZE:-31}"
MIN_ABUNDANCE="${MIN_ABUNDANCE:-1}"

#######################################
# Walk SIMDATA_DIR/<species>/<strain>/ and build one index per strain.
# Globals:   SIMDATA_DIR, REF_DIR, PYTHON, BUILD_PY, KMER_SIZE,
#            MIN_ABUNDANCE (all read)
# Arguments: none
# Outputs:   one "[<species>--<strain>] → <output path>" line per build on
#            stdout; SKIP and WARNING diagnostics on stderr
# Returns:   0 on success. Must not be called from a conditional context:
#            the script relies on set -e to abort on the first failed build.
#######################################
build_all_references() {
  local species_dir strain_dir species strain r1 r2 out
  local built=0

  mkdir -p "${REF_DIR}"

  for species_dir in "${SIMDATA_DIR}"/*/; do
    # An unmatched glob is left as the literal pattern; -d filters it out.
    [[ -d "${species_dir}" ]] || continue
    # Glob results end in "/": strip it so derived paths do not contain "//".
    species_dir="${species_dir%/}"
    species="${species_dir##*/}"

    for strain_dir in "${species_dir}"/*/; do
      [[ -d "${strain_dir}" ]] || continue
      strain_dir="${strain_dir%/}"
      strain="${strain_dir##*/}"

      r1="${strain_dir}/reads_R1.fastq.gz"
      r2="${strain_dir}/reads_R2.fastq.gz"
      if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
        echo "SKIP ${species}--${strain}: reads not found" >&2
        continue
      fi

      out="${REF_DIR}/${species}--${strain}.npz"
      echo "[${species}--${strain}] → ${out}"

      "${PYTHON}" "${BUILD_PY}" \
        --kmer-size "${KMER_SIZE}" \
        --min-abundance "${MIN_ABUNDANCE}" \
        --output "${out}" \
        "${r1}" "${r2}"

      built=$((built + 1))
    done
  done

  # Make an empty run visible instead of exiting silently; the exit status is
  # deliberately left at 0 so existing callers keep working.
  if ((built == 0)); then
    echo "WARNING: no reference built: no strain with paired reads under ${SIMDATA_DIR}" >&2
  fi
}

build_all_references
|
||||
Reference in New Issue
Block a user