feat: add benchmark pipeline, expose APIs, and enforce strict paths
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
This commit is contained in:
@@ -0,0 +1,33 @@
|
||||
#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
#
# Simulates paired-end HiSeq reads for a single genome.
#
# The gzip-compressed FASTA is decompressed to a temporary file, its total
# sequence length is measured, and InSilicoSeq (`iss generate`) is asked for
# the number of read pairs that yields COVERAGE-fold coverage with reads of
# READ_LENGTH bases. Output files are written with the prefix
# "<output_dir>/reads".
#
# Environment:
#   CPUS    number of CPUs passed to iss (default: detected logical CPU
#           count, falling back to 2)
#   TMPDIR  parent directory for the temporary FASTA (default: /tmp)
#
# Exit status: 0 on success, 2 on a usage error, non-zero on any other failure.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
# sysctl covers macOS, nproc covers Linux; 2 is the last-resort default.
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"

# Print an error message to stderr and abort.
die() {
  printf 'simulate_one.sh: %s\n' "$*" >&2
  exit 1
}

# Under `set -u` a missing positional would otherwise abort with a bare
# "unbound variable" message, so check explicitly and show the usage line.
if (( $# < 2 )); then
  printf 'Usage: %s genome.fna.gz output_dir\n' "${0##*/}" >&2
  exit 2
fi

genome_file="$1"
out_dir="$2"

# Fail early, before any work is done, when a prerequisite is missing.
[[ -r "${genome_file}" ]] || die "cannot read genome file: ${genome_file}"
[[ -x "${ISS}" ]] || die "iss executable not found at ${ISS}"

mkdir -p -- "${out_dir}"

# BSD/macOS mktemp only substitutes X's at the very end of the template, so a
# template with a ".fna" suffix yields a fixed, predictable name there and
# collides between concurrent runs. Create a private directory instead and
# keep the FASTA (with its extension) inside it.
tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/obikmer_XXXXXX")
trap 'rm -rf -- "${tmp_dir}"' EXIT
tmp_fasta="${tmp_dir}/genome.fna"

gzip -dc -- "${genome_file}" > "${tmp_fasta}"

# Count sequence characters: drop header lines, strip all whitespace, count
# bytes. grep exits 1 when no non-header line exists, which pipefail would
# otherwise turn into a silent abort.
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ') \
  || die "no sequence data found in ${genome_file}"
(( genome_size > 0 )) || die "genome ${genome_file} contains 0 bp"

# Each pair contributes 2 * READ_LENGTH bases; integer ceiling of
# COVERAGE * genome_size / (2 * READ_LENGTH).
bases_per_pair=$(( 2 * READ_LENGTH ))
n_reads=$(( (COVERAGE * genome_size + bases_per_pair - 1) / bases_per_pair ))

echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"

"${ISS}" generate \
  --genomes "${tmp_fasta}" \
  --model HiSeq \
  --n_reads "${n_reads}" \
  --cpus "${CPUS}" \
  --compress \
  --output "${out_dir}/reads"
|
||||
Reference in New Issue
Block a user