Files
obikmer/benchmark/make_deps.py
Eric Coissac c694e1f2b0 feat: add benchmark pipeline, expose APIs, and enforce strict paths
Introduces a Make-based orchestration for simulating, indexing, merging, filtering, and verifying k-mer counts and presence. Exposes internal builder and iterator APIs publicly, enforces mandatory leading slashes for predicate patterns, registers the `obitaxonomy` crate, and updates tooling configurations alongside documentation.
2026-06-22 10:18:33 +02:00

119 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
Like C .d files: only target: prerequisites lines, no recipes.
Recipes stay in the Makefile as generic rules.
"""
import gzip
import re
import sys
from pathlib import Path
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
'endosymbiont', 'of'}
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
def is_stop(tok):
t = tok.lower()
return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)
def sanitize(s):
return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')
def collect_tokens(text):
parts = []
for tok in text.split():
tok = tok.rstrip(',.')
if is_stop(tok):
break
parts.append(sanitize(tok))
return '_'.join(filter(None, parts))
def parse_organism(defn, gcf_id):
words = defn.split()
species = sanitize(words[0] + '_' + words[1])
m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
if m:
strain = sanitize(m.group(1))
if m.group(2):
strain += '_' + sanitize(m.group(2))
return species, strain
m = re.search(r'\bstrain\b\s+(.*)', defn)
if m:
strain = collect_tokens(m.group(1))
if strain:
return species, strain
remainder = re.sub(r'^\S+ \S+\s*', '', defn)
remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
strain = collect_tokens(remainder)
return species, strain if strain else gcf_id
def first_definition(path):
with gzip.open(path, 'rt') as fh:
for line in fh:
if line.startswith('>'):
m = re.search(r'"definition":"([^"]*)"', line)
return m.group(1) if m else line[1:].split()[0]
return Path(path).stem
def main():
entries = [] # (specimen, species, sim_dir, genome_path)
species_seen = []
for path in sorted(sys.argv[1:]):
gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
defn = first_definition(path)
sp, st = parse_organism(defn, gcf_id)
specimen = f'{sp}--{st}'
sim_dir = f'simulated_data/{sp}/{st}'
entries.append((specimen, sp, sim_dir, path))
if sp not in species_seen:
species_seen.append(sp)
specimens = [e[0] for e in entries]
print('SPECIMENS :=', ' '.join(specimens))
print('SPECIES :=', ' '.join(species_seen))
for specimen, species, sim_dir, genome in entries:
reads = f'{sim_dir}/reads_R1.fastq.gz'
p_done = f'specimen_index_presence/{specimen}/index.done'
p_stats = f'stats/indexing_presence/{specimen}.stats'
c_done = f'specimen_index_count/{specimen}/index.done'
c_stats = f'stats/indexing_count/{specimen}.stats'
ref = f'reference_index/{specimen}.npz'
vp = f'stats/verify_presence/{specimen}.stats'
vc = f'stats/verify_count/{specimen}.stats'
print()
print(f'# {specimen}')
print(f'{reads}: {genome}')
print(f'{ref}: {reads}')
print(f'{p_done} {p_stats}: {reads}')
print(f'{c_done} {c_stats}: {reads}')
print(f'{vp}: {ref} {p_done}')
print(f'{vc}: {ref} {c_done}')
print()
for sp in species_seen:
sp_done = f'specific_index_presence/{sp}/index.done'
sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
sc_done = f'specific_index_count/{sp}/index.done'
sc_stats = f'stats/specific_kmer_count/{sp}.stats'
print(f'# {sp}')
print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
print(f'{sc_done} {sc_stats}: global_index_count/index.done')
if __name__ == '__main__':
main()