c694e1f2b0
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
34 lines
1021 B
Bash
34 lines
1021 B
Bash
#!/usr/bin/env bash
|
|
# Usage: simulate_one.sh genome.fna.gz output_dir
|
|
# Simulates paired-end HiSeq reads for a single genome.
|
|
set -euo pipefail
|
|
|
|
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
ISS="${SCRIPT_DIR}/../.venv/bin/iss"
|
|
COVERAGE=15
|
|
READ_LENGTH=150
|
|
CPUS="${CPUS:-$(sysctl -n hw.logicalcpu 2>/dev/null || nproc 2>/dev/null || echo 2)}"
|
|
|
|
genome_file="$1"
|
|
out_dir="$2"
|
|
|
|
mkdir -p "${out_dir}"
|
|
|
|
tmp_fasta=$(mktemp "${TMPDIR:-/tmp}/obikmer_XXXXXX.fna")
|
|
trap 'rm -f "${tmp_fasta}"' EXIT
|
|
|
|
gzip -dc "${genome_file}" > "${tmp_fasta}"
|
|
|
|
genome_size=$(grep -v "^>" "${tmp_fasta}" | tr -d '[:space:]' | wc -c | tr -d ' ')
|
|
n_reads=$(python3 -c "import math; print(math.ceil(${COVERAGE} * ${genome_size} / (2 * ${READ_LENGTH})))")
|
|
|
|
echo "[${out_dir}] genome=${genome_size} bp → ${n_reads} read pairs (${COVERAGE}x HiSeq)"
|
|
|
|
"${ISS}" generate \
|
|
--genomes "${tmp_fasta}" \
|
|
--model HiSeq \
|
|
--n_reads "${n_reads}" \
|
|
--cpus "${CPUS}" \
|
|
--compress \
|
|
--output "${out_dir}/reads"
|