Files
obikmer/benchmark/simulate_one.sh
Eric Coissac c694e1f2b0 feat: add benchmark pipeline, expose APIs, and enforce strict paths
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
2026-06-22 10:18:33 +02:00

34 lines
1021 B
Bash

#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
genome_file="$1"
out_dir="$2"
mkdir -p "${out_dir}"
tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna")
trap 'rm -f "${tmp_fasta}"' EXIT
gzip -dc "${genome_file}" > "${tmp_fasta}"
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"
"${ISS}" generate \
--genomes "${tmp_fasta}" \
--model HiSeq \
--n_reads "${n_reads}" \
--cpus "${CPUS}" \
--compress \
--output "${out_dir}/reads"