feat: add benchmark pipeline, expose APIs, and enforce strict paths
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
This commit is contained in:
Executable
+102
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env bash
# Usage: index_one_presence.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
#
# Builds the presence index for one specimen from its paired simulated reads
# and records the timing table that obikmer prints on stderr.
#
# Outputs:
#   specimen_index_presence/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_presence/SPECIMEN.stats        (one CSV data row, no header)
#
# Exit status: non-zero when SPECIMEN is missing, when the reads are missing,
# or when obikmer fails (obikmer's own exit status is propagated).
set -euo pipefail

# Fail with a readable message instead of bash's "unbound variable" error.
SPECIMEN="${1:?Usage: index_one_presence.sh SPECIMEN}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

# Split the stem on "--": species is the text before the first "--",
# strain is everything after it.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_presence/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_presence"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"
if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
    echo "ERROR: reads not found in ${READS_DIR}" >&2
    exit 1
fi

echo "[${SPECIMEN}] indexing (presence) → ${INDEX_PATH}"

# stderr is captured to a temporary file because the inline Python below
# parses it; the file is removed on every exit path.
STDERR_LOG=$(mktemp)
trap 'rm -f "${STDERR_LOG}"' EXIT

# Record the exit status instead of letting `set -e` abort here: aborting
# immediately would delete the log (EXIT trap) before it is ever shown,
# hiding the tool's error message.
status=0
"${BINARY}" index \
    --output "${INDEX_PATH}" \
    --force \
    --theta 0 \
    --label "${SPECIMEN}" \
    --meta "species=${species}" \
    "${r1}" "${r2}" \
    2>"${STDERR_LOG}" || status=$?

# Replay the captured stderr whether or not the command succeeded.
cat "${STDERR_LOG}" >&2

if (( status != 0 )); then
    echo "ERROR: obikmer index failed for ${SPECIMEN} (exit ${status})" >&2
    exit "${status}"
fi

# Convert the captured log into a single CSV row written to the stats file.
python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
|
||||
import sys, re
|
||||
|
||||
# Positional arguments passed on the python3 command line by the enclosing
# shell script: species, strain, and the path of the log file to parse.
species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
|
||||
|
||||
def strip_ansi(s):
    """Return *s* with ANSI CSI escape sequences removed.

    A sequence is ESC ``[`` followed by any number of parameter bytes
    (0x30-0x3f), any number of intermediate bytes (0x20-0x2f) and exactly
    one final byte (0x40-0x7e). All other text is returned unchanged.
    """
    csi_sequence = re.compile(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]')
    return csi_sequence.sub('', s)
|
||||
|
||||
def parse_wall(s):
    """Convert a wall-clock duration string to seconds.

    Recognised unit suffixes are ``s``, ``ms``, ``us`` / ``µs`` and ``ns``;
    surrounding whitespace is ignored.

    Args:
        s: Duration text such as ``'3.5s'`` or ``'12ms'``.

    Returns:
        The duration in seconds as a float, or ``0.0`` when the suffix is
        not recognised or the numeric part cannot be parsed.
    """
    text = s.strip()
    # Multi-character suffixes come first: every one of them also ends in
    # 's', so testing plain 's' earlier would feed e.g. '12µ' to float()
    # and raise ValueError.
    units = (
        ('ms', 1000.0),
        ('\u00b5s', 1e6),  # micro sign
        ('\u03bcs', 1e6),  # Greek small letter mu
        ('us', 1e6),
        ('ns', 1e9),
        ('s', 1.0),
    )
    for suffix, per_second in units:
        if text.endswith(suffix):
            try:
                return float(text[:-len(suffix)]) / per_second
            except ValueError:
                # Same fallback as for an unknown suffix.
                return 0.0
    return 0.0
|
||||
|
||||
def parse_rss(s):
    """Convert a human-readable memory size to a whole number of bytes.

    The text must start with a decimal number, optionally followed by
    whitespace, then a unit. Units are interpreted as powers of 1024:
    ``B``, ``KB``/``KiB``, ``MB``/``MiB``, ``GB``/``GiB``, ``TB``/``TiB``.
    Anything after the unit is ignored.

    Args:
        s: Size text such as ``'1.5 GB'`` or ``'512MB'``.

    Returns:
        The size in bytes, truncated to an int, or ``0`` when the text does
        not match.
    """
    multipliers = {
        'B': 1,
        'KB': 1 << 10, 'KiB': 1 << 10,
        'MB': 1 << 20, 'MiB': 1 << 20,
        'GB': 1 << 30, 'GiB': 1 << 30,
        'TB': 1 << 40, 'TiB': 1 << 40,
    }
    # The number pattern accepts only what float() can parse, so malformed
    # input such as '.' or '1.2.3' yields 0 instead of raising ValueError.
    m = re.match(
        r'(\d+(?:\.\d*)?|\.\d+)\s*(TiB|GiB|MiB|KiB|TB|GB|MB|KB|B)',
        s.strip(),
    )
    if not m:
        return 0
    return int(float(m.group(1)) * multipliers[m.group(2)])
|
||||
|
||||
def is_sep(s):
    """Tell whether *s* is a separator (rule) line.

    A separator is a non-empty string that contains no ASCII letter and no
    ASCII digit.
    """
    if not s:
        return False
    return re.search(r'[A-Za-z0-9]', s) is None
|
||||
|
||||
# Parsed table: first cell of each row -> (wall time in seconds, RSS in bytes).
stats = {}
# Parser state; it only moves forward:
#   'scan'      -> looking for the table header line
#   'in_header' -> header seen, waiting for the separator below it
#   'rows'      -> reading per-stage rows until the next separator
#   'total'     -> reading the single summary row after that separator
state = 'scan'

# errors='replace' keeps undecodable bytes in the log from aborting the parse.
with open(logfile, errors='replace') as fh:
    for raw in fh:
        # Drop the newline and any ANSI escape sequences before matching.
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()
        if state == 'scan':
            # The header is the first line containing the word "stage"
            # followed later by the word "wall".
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s): state = 'rows'
        elif state == 'rows':
            if is_sep(s): state = 'total'
            elif s:
                # NOTE(review): cells are split on runs of spaces; if a cell
                # can itself contain a single space (e.g. "1.5 GB"), confirm
                # that the intended separator is not two-or-more spaces.
                parts = re.split(r' +', s)
                # Cell 1 is the wall time and cell 3 the RSS; cell 2 is unused.
                # Rows with fewer than four cells are skipped.
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                # The summary row may lack the RSS cell; 0 is stored then.
                if len(parts) >= 3:
                    stats[parts[0]] = (parse_wall(parts[1]),
                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
                # NOTE(review): stops at the first non-blank line after the
                # closing separator; confirm this nesting of `break` matches
                # the intended indentation.
                break
|
||||
|
||||
# Stage columns appear in this fixed order, followed by the TOTAL columns.
STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']

# One CSV data row: species, strain, then a (wall, rss) pair per stage and a
# final pair for TOTAL. A key missing from `stats` yields two empty cells.
row = [species, strain]
for key in STAGE_ORDER + ['TOTAL']:
    wall, rss = stats.get(key, ('', ''))
    # Wall time is printed with millisecond precision; the '' placeholder
    # for a missing key is not a float and stays empty.
    wall_cell = f'{wall:.3f}' if isinstance(wall, float) else ''
    row.extend((wall_cell, str(rss)))
print(','.join(row))
|
||||
PYEOF
|
||||
Reference in New Issue
Block a user