feat: add benchmark pipeline, expose APIs, and enforce strict paths

Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
2026-06-19 09:55:41 +02:00
parent 280ca1f5a3
commit c694e1f2b0
42 changed files with 2585 additions and 84 deletions
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+# Usage: index_one_count.sh SPECIMEN
+# SPECIMEN = "species--strain" (Make pattern stem)
+# Outputs:
+#   specimen_index_count/SPECIMEN/index.done  (written by obikmer)
+#   stats/indexing_count/SPECIMEN.stats       (one CSV data row, no header)
+set -euo pipefail
+
+SPECIMEN="$1"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BINARY="${SCRIPT_DIR}/../src/target/release/obikmer"
+
+species="${SPECIMEN%%--*}"
+strain="${SPECIMEN#*--}"
+
+READS_DIR="${SCRIPT_DIR}/simulated_data/${species}/${strain}"
+INDEX_PATH="${SCRIPT_DIR}/specimen_index_count/${SPECIMEN}"
+STATS_DIR="${SCRIPT_DIR}/stats/indexing_count"
+STATS_FILE="${STATS_DIR}/${SPECIMEN}.stats"
+
+mkdir -p "${STATS_DIR}"
+
+r1="${READS_DIR}/reads_R1.fastq.gz"
+r2="${READS_DIR}/reads_R2.fastq.gz"
+if [[ ! -f "${r1}" || ! -f "${r2}" ]]; then
+    echo "ERROR: reads not found in ${READS_DIR}" >&2
+    exit 1
+fi
+
+echo "[${SPECIMEN}] indexing (count) → ${INDEX_PATH}"
+
+STDERR_LOG=$(mktemp)
+trap 'rm -f "${STDERR_LOG}"' EXIT
+
+"${BINARY}" index \
+    --output "${INDEX_PATH}" \
+    --force \
+    --theta 0 \
+    --with-counts \
+    --label "${SPECIMEN}" \
+    --meta  "species=${species}" \
+    "${r1}" "${r2}" \
+    2>"${STDERR_LOG}"
+
+cat "${STDERR_LOG}" >&2
+
+python3 - "${species}" "${strain}" "${STDERR_LOG}" <<'PYEOF' >"${STATS_FILE}"
+import sys, re
+
+species, strain, logfile = sys.argv[1], sys.argv[2], sys.argv[3]
+
+def strip_ansi(s):
+    return re.sub(r'\x1b\[[\x30-\x3f]*[\x20-\x2f]*[\x40-\x7e]', '', s)
+
+def parse_wall(s):
+    s = s.strip()
+    if s.endswith('ms'): return float(s[:-2]) / 1000.0
+    if s.endswith('s'):  return float(s[:-1])
+    return 0.0
+
+def parse_rss(s):
+    m = re.match(r'([\d.]+)\s*(GB|MB|KB|B)', s.strip())
+    if not m: return 0
+    return int(float(m.group(1)) * {'GB': 1<<30, 'MB': 1<<20, 'KB': 1024, 'B': 1}[m.group(2)])
+
+def is_sep(s):
+    return bool(s) and not re.search(r'[A-Za-z0-9]', s)
+
+# Measurements keyed by the first column of each table row:
+# name -> (value from parse_wall, value from parse_rss).
+stats = {}
+# Parser state, advanced in order: 'scan' -> 'in_header' -> 'rows' -> 'total'.
+state = 'scan'
+
+# errors='replace' substitutes undecodable bytes instead of raising.
+with open(logfile, errors='replace') as fh:
+    for raw in fh:
+        line = strip_ansi(raw.rstrip('\n'))
+        s    = line.strip()
+        if state == 'scan':
+            # Skip everything until the table header, recognised as a line
+            # containing the word "stage" followed later by the word "wall".
+            if re.search(r'\bstage\b.*\bwall\b', line):
+                state = 'in_header'
+        elif state == 'in_header':
+            # The first separator line after the header starts the data rows.
+            if is_sep(s): state = 'rows'
+        elif state == 'rows':
+            # A second separator line ends the per-stage rows.
+            if is_sep(s): state = 'total'
+            elif s:
+                # Columns are split on runs of two or more spaces. Column 0 is
+                # the row name, column 1 goes to parse_wall, column 3 to
+                # parse_rss; rows with fewer than four columns are ignored.
+                # NOTE(review): column 2 is never read -- presumably another
+                # metric in obikmer's table; confirm against its output.
+                parts = re.split(r'  +', s)
+                if len(parts) >= 4:
+                    stats[parts[0]] = (parse_wall(parts[1]), parse_rss(parts[3]))
+        elif state == 'total':
+            # Only the first line after the second separator is examined. It
+            # is stored under its own first column; the RSS is 0 when the
+            # line has exactly three columns.
+            if s:
+                parts = re.split(r'  +', s)
+                if len(parts) >= 3:
+                    stats[parts[0]] = (parse_wall(parts[1]),
+                                       parse_rss(parts[3]) if len(parts) > 3 else 0)
+            # Stop reading after that line, whether or not it was blank or stored.
+            break
+
+STAGE_ORDER = ['scatter', 'dereplicate', 'count_kmer', 'index']
+row = [species, strain]
+for stage in STAGE_ORDER:
+    w, r = stats.get(stage, ('', ''))
+    row += [f'{w:.3f}' if isinstance(w, float) else '', str(r)]
+tw, tr = stats.get('TOTAL', ('', ''))
+row += [f'{tw:.3f}' if isinstance(tw, float) else '', str(tr)]
+print(','.join(row))
+PYEOF