Files

34 lines
1021 B
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
# Usage: simulate_one.sh genome.fna.gz output_dir
# Simulates paired-end HiSeq reads for a single genome.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
COVERAGE=15
READ_LENGTH=150
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
genome_file="$1"
out_dir="$2"
mkdir -p "${out_dir}"
tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna")
trap 'rm -f "${tmp_fasta}"' EXIT
gzip -dc "${genome_file}" > "${tmp_fasta}"
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"
"${ISS}" generate \
--genomes "${tmp_fasta}" \
--model HiSeq \
--n_reads "${n_reads}" \
--cpus "${CPUS}" \
--compress \
--output "${out_dir}/reads"