#!/usr/bin/env python3 """Generate deps.mk — pure dependency declarations for the benchmark pipeline. Like C .d files: only target: prerequisites lines, no recipes. Recipes stay in the Makefile as generic rules. """ import gzip import re import sys from pathlib import Path STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome', 'endosymbiont', 'of'} STOP_PREFIXES = ('scaffold', 'contig', 'plasmid') def is_stop(tok): t = tok.lower() return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES) def sanitize(s): return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_') def collect_tokens(text): parts = [] for tok in text.split(): tok = tok.rstrip(',.') if is_stop(tok): break parts.append(sanitize(tok)) return '_'.join(filter(None, parts)) def parse_organism(defn, gcf_id): words = defn.split() species = sanitize(words[0] + '_' + words[1]) m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn) if m: strain = sanitize(m.group(1)) if m.group(2): strain += '_' + sanitize(m.group(2)) return species, strain m = re.search(r'\bstrain\b\s+(.*)', defn) if m: strain = collect_tokens(m.group(1)) if strain: return species, strain remainder = re.sub(r'^\S+ \S+\s*', '', defn) remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder) remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder) strain = collect_tokens(remainder) return species, strain if strain else gcf_id def first_definition(path): with gzip.open(path, 'rt') as fh: for line in fh: if line.startswith('>'): m = re.search(r'"definition":"([^"]*)"', line) return m.group(1) if m else line[1:].split()[0] return Path(path).stem def main(): entries = [] # (specimen, species, sim_dir, genome_path) species_seen = [] for path in sorted(sys.argv[1:]): gcf_id = Path(path).name.replace('_genomic.fna.gz', '') defn = first_definition(path) sp, st = parse_organism(defn, gcf_id) specimen = f'{sp}--{st}' sim_dir = f'simulated_data/{sp}/{st}' entries.append((specimen, sp, sim_dir, path)) if sp not in species_seen: species_seen.append(sp) specimens = [e[0] for e in entries] print('SPECIMENS :=', ' '.join(specimens)) print('SPECIES :=', ' '.join(species_seen)) for specimen, species, sim_dir, genome in entries: reads = f'{sim_dir}/reads_R1.fastq.gz' p_done = f'specimen_index_presence/{specimen}/index.done' p_stats = f'stats/indexing_presence/{specimen}.stats' c_done = f'specimen_index_count/{specimen}/index.done' c_stats = f'stats/indexing_count/{specimen}.stats' ref = f'reference_index/{specimen}.npz' vp = f'stats/verify_presence/{specimen}.stats' vc = f'stats/verify_count/{specimen}.stats' print() print(f'# {specimen}') print(f'{reads}: {genome}') print(f'{ref}: {reads}') print(f'{p_done} {p_stats}: {reads}') print(f'{c_done} {c_stats}: {reads}') print(f'{vp}: {ref} {p_done}') print(f'{vc}: {ref} {c_done}') print() for sp in species_seen: sp_done = f'specific_index_presence/{sp}/index.done' sp_stats = f'stats/specific_kmer_presence/{sp}.stats' sc_done = f'specific_index_count/{sp}/index.done' sc_stats = f'stats/specific_kmer_count/{sp}.stats' print(f'# {sp}') print(f'{sp_done} {sp_stats}: global_index_presence/index.done') print(f'{sc_done} {sc_stats}: global_index_count/index.done') if __name__ == '__main__': main()