#!/usr/bin/env bash
# Usage: index_one_count.sh SPECIMEN
#   SPECIMEN = "species--strain"  (Make pattern stem)
# Outputs:
#   specimen_index_count/SPECIMEN/index.done   (written by obikmer)
#   stats/indexing_count/SPECIMEN.stats        (one CSV data row, no header)
#
# CSV row layout: species,strain, then a (wall seconds, RSS bytes) pair for
# each of scatter, dereplicate, count_kmer, index and finally TOTAL.
# Fields for stages absent from the obikmer report are left empty.
#
# Exit status: 2 on usage error, 1 when the read files are missing, the
# exit status of obikmer when indexing fails, 0 on success.

set -euo pipefail

# Explicit usage check: with `set -u` a missing argument would otherwise
# abort with an unhelpful "unbound variable" message.
if [[ -z "${1:-}" ]]; then
  echo "Usage: ${0##*/} SPECIMEN   (SPECIMEN = \"species--strain\")" >&2
  exit 2
fi

SPECIMEN="$1"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"

# Split "species--strain": species is everything before the first "--",
# strain is everything after it.
species="${SPECIMEN%%--*}"
strain="${SPECIMEN#*--}"

READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"

mkdir -p "${STATS_DIR}"

r1="${READS_DIR}/reads_R1.fastq.gz"
r2="${READS_DIR}/reads_R2.fastq.gz"

if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
  echo "ERROR: reads not found in ${READS_DIR}" >&2
  exit 1
fi

echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"

# The stats row is first written next to its final location and renamed
# into place only after the parser succeeded, so a failed run never leaves
# an empty or truncated .stats file behind.
STATS_TMP="${STATS_FILE}.tmp.$$"
STDERR_LOG=$(mktemp)
trap 'rm -f -- "${STDERR_LOG}" "${STATS_TMP}"' EXIT

# Capture the exit status instead of letting `set -e` abort right here:
# the captured stderr must be replayed even (especially) when obikmer fails,
# otherwise the EXIT trap would delete the only copy of its diagnostics.
rc=0
"${BINARY}" index \
  --output "${INDEX_PATH}" \
  --force \
  --theta 0 \
  --with-counts \
  --label "${SPECIMEN}" \
  --meta "species=${species}" \
  "${r1}" "${r2}" \
  2>"${STDERR_LOG}" || rc=$?

cat "${STDERR_LOG}" >&2

if ((rc != 0)); then
  echo "ERROR: obikmer index failed for ${SPECIMEN} (exit ${rc})" >&2
  exit "${rc}"
fi

python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_TMP}"
# Parse the per-stage timing table found in the captured stderr log and
# print a single CSV row (no header) on stdout.
import sys, re

species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]


def strip_ansi(s):
    """Remove ANSI CSI escape sequences (ESC [ ... final byte) from s."""
    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)


def parse_wall(s):
    """Convert '<n>ms' or '<n>s' to seconds; any other suffix yields 0.0."""
    s = s.strip()
    # 'ms' must be tested first because it also ends with 's'.
    if s.endswith('ms'):
        return float(s[:-2]) / 1000.0
    if s.endswith('s'):
        return float(s[:-1])
    return 0.0


def parse_rss(s):
    """Convert '<n> GB|MB|KB|B' (1024-based) to bytes; 0 when unparsable."""
    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
    if not m:
        return 0
    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])


def is_sep(s):
    """True for a non-empty line without any letter or digit (table rule)."""
    return bool(s) and not re.search(r'[A-Za-z0-9]', s)


# State machine over the log:
#   scan      -> looking for the header line containing 'stage' ... 'wall'
#   in_header -> waiting for the separator line that closes the header
#   rows      -> one line per stage, until the next separator line
#   total     -> first non-blank line after that separator, then stop
stats = {}
state = 'scan'
with open(logfile, errors='replace') as fh:
    for raw in fh:
        line = strip_ansi(raw.rstrip('\n'))
        s = line.strip()
        if state == 'scan':
            if re.search(r'\bstage\b.*\bwall\b', line):
                state = 'in_header'
        elif state == 'in_header':
            if is_sep(s):
                state = 'rows'
        elif state == 'rows':
            if is_sep(s):
                state = 'total'
            elif s:
                # Field 0 is the stage name, field 1 the wall time and
                # field 3 the RSS.
                # NOTE(review): fields are split on single spaces, so a
                # value that itself contains a space (e.g. '1.2 GB') would
                # span two fields - confirm against obikmer's table format.
                parts = re.split(r' +', s)
                if len(parts) >= 4:
                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
        elif state == 'total':
            if s:
                parts = re.split(r' +', s)
                if len(parts) >= 3:
                    # The summary row may lack the RSS column; record 0 then.
                    stats[parts[0]] = (parse_wall(parts[1]),
                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
                # Only the first non-blank line after the separator is used.
                break

STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']

row = [species, strain]
for stage in STAGE_ORDER:
    # Missing stages contribute two empty CSV fields.
    w, r = stats.get(stage, ('', ''))
    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
tw, tr = stats.get('TOTAL', ('', ''))
row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
print(','.join(row))
PYEOF

# Publish the stats row only now that the parser has succeeded.
mv -f -- "${STATS_TMP}" "${STATS_FILE}"