obikmer/benchmark/make_deps.py

#!/usr/bin/env python3
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.

Like C .d files: only target: prerequisites lines, no recipes.
Recipes stay in the Makefile as generic rules.
"""
import gzip
import re
import sys
from pathlib import Path

# Lower-case words at which collect_tokens() stops accumulating a strain name
# (is_stop() lower-cases the token before looking it up here).
STOP_WORDS = {
    'chromosome',
    'complete',
    'endosymbiont',
    'genome',
    'of',
    'sequence',
    'whole',
}
# A token that starts with any of these (after lower-casing) is a stop token too.
STOP_PREFIXES = (
    'scaffold',
    'contig',
    'plasmid',
)


def is_stop(tok):
    """Return True if *tok* ends a strain name.

    The comparison is case-insensitive: the token is a stop token when its
    lower-cased form is a member of STOP_WORDS or begins with one of the
    STOP_PREFIXES.
    """
    lowered = tok.lower()
    if lowered in STOP_WORDS:
        return True
    # str.startswith accepts a tuple and tests every prefix in one call.
    return lowered.startswith(STOP_PREFIXES)


def sanitize(s):
    """Return *s* reduced to characters from ``A-Za-z0-9._-``.

    Every other character is replaced by an underscore, then leading and
    trailing underscores are removed. The result may be an empty string.
    """
    unsafe = re.compile(r'[^A-Za-z0-9._-]')
    replaced = unsafe.sub('_', s)
    return replaced.strip('_')


def collect_tokens(text):
    """Join the leading words of *text* into an underscore-separated label.

    Words are taken in order, with trailing commas and periods removed, until
    the first stop token (see is_stop) is met. Each kept word is sanitized;
    words that sanitize to an empty string are dropped. Returns '' when
    nothing is kept.
    """
    kept = []
    for raw in text.split():
        word = raw.rstrip(',.')
        if is_stop(word):
            # Everything from the first stop token onwards is ignored.
            break
        cleaned = sanitize(word)
        if cleaned:
            kept.append(cleaned)
    return '_'.join(kept)


def parse_organism(defn, gcf_id):
    """Derive ``(species, strain)`` labels from a genome definition string.

    Parameters:
        defn: free-text definition, e.g. the value returned by
            first_definition().
        gcf_id: fallback label used whenever a component cannot be derived
            from *defn*.

    Returns:
        A ``(species, strain)`` tuple of non-empty strings.

    The species is the first two whitespace-separated words of *defn*, joined
    by an underscore and sanitized. The strain is looked up in this order:

    1. ``str. X`` optionally followed by ``substr. Y`` -> ``X`` or ``X_Y``;
    2. the words following a standalone ``strain`` keyword, up to the first
       stop token;
    3. the words following the species (after skipping a leading
       ``subsp. X`` and/or ``serovar X``), up to the first stop token;
    4. *gcf_id*.
    """
    words = defn.split()
    # Slicing tolerates definitions with fewer than two words (for example a
    # bare sequence ID), where indexing words[1] would raise IndexError.
    species = sanitize('_'.join(words[:2])) or gcf_id

    m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
    if m:
        strain = sanitize(m.group(1))
        if m.group(2):
            strain += '_' + sanitize(m.group(2))
        # A strain made only of unsafe characters sanitizes to ''.
        return species, strain or gcf_id

    m = re.search(r'\bstrain\b\s+(.*)', defn)
    if m:
        strain = collect_tokens(m.group(1))
        if strain:
            return species, strain

    # Drop the two species words using the same whitespace splitting that
    # produced them, so tabs or repeated spaces cannot leak the species into
    # the strain.
    remainder = ' '.join(words[2:])
    remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
    remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
    strain = collect_tokens(remainder)
    return species, strain or gcf_id


def first_definition(path):
    """Return a description taken from the first ``>`` header of a gzipped file.

    Parameters:
        path: path to a gzip-compressed text file; it is opened in text mode
            and scanned line by line.

    Returns:
        For the first line starting with ``>``:
        the value of a ``"definition":"..."`` field if the line contains one,
        otherwise the first whitespace-separated token after the ``>``.
        If that header carries no text at all, or the file has no header
        line, the stem of *path* (``Path(path).stem``) is returned.
    """
    with gzip.open(path, 'rt') as fh:
        for line in fh:
            if not line.startswith('>'):
                continue
            m = re.search(r'"definition":"([^"]*)"', line)
            if m:
                return m.group(1)
            fields = line[1:].split()
            if fields:
                return fields[0]
            # Bare '>' header: indexing the empty token list would raise
            # IndexError, so fall through to the file-name fallback.
            break
    return Path(path).stem


def main():
    """Print dependency declarations for the genomes named on the command line.

    Each argument is a path to a gzipped genome file; arguments are processed
    in sorted order. The output, written to stdout, consists of:

    * ``SPECIMENS`` and ``SPECIES`` variable assignments;
    * for every specimen, a comment line followed by ``target: prerequisite``
      lines chaining genome -> simulated reads -> reference/indexes -> verify
      stats;
    * for every species (in first-seen order), lines making the
      species-specific indexes depend on the global indexes.
    """
    records = []        # (specimen, species, sim_dir, genome_path)
    species_order = []  # distinct species, in order of first appearance

    for genome_path in sorted(sys.argv[1:]):
        accession = Path(genome_path).name.replace('_genomic.fna.gz', '')
        definition = first_definition(genome_path)
        species, strain = parse_organism(definition, accession)
        records.append((
            f'{species}--{strain}',
            species,
            f'simulated_data/{species}/{strain}',
            genome_path,
        ))
        if species not in species_order:
            species_order.append(species)

    print('SPECIMENS :=', ' '.join(record[0] for record in records))
    print('SPECIES   :=', ' '.join(species_order))

    for specimen, _species, sim_dir, genome in records:
        reads = f'{sim_dir}/reads_R1.fastq.gz'
        reference = f'reference_index/{specimen}.npz'
        presence_done = f'specimen_index_presence/{specimen}/index.done'
        presence_stats = f'stats/indexing_presence/{specimen}.stats'
        count_done = f'specimen_index_count/{specimen}/index.done'
        count_stats = f'stats/indexing_count/{specimen}.stats'
        verify_presence = f'stats/verify_presence/{specimen}.stats'
        verify_count = f'stats/verify_count/{specimen}.stats'

        rules = [
            f'# {specimen}',
            f'{reads}: {genome}',
            f'{reference}: {reads}',
            f'{presence_done} {presence_stats}: {reads}',
            f'{count_done} {count_stats}: {reads}',
            f'{verify_presence}: {reference} {presence_done}',
            f'{verify_count}: {reference} {count_done}',
        ]
        print()  # blank separator line before each specimen section
        print('\n'.join(rules))

    print()
    for name in species_order:
        presence_done = f'specific_index_presence/{name}/index.done'
        presence_stats = f'stats/specific_kmer_presence/{name}.stats'
        count_done = f'specific_index_count/{name}/index.done'
        count_stats = f'stats/specific_kmer_count/{name}.stats'
        rules = [
            f'# {name}',
            f'{presence_done} {presence_stats}: global_index_presence/index.done',
            f'{count_done} {count_stats}: global_index_count/index.done',
        ]
        print('\n'.join(rules))


# Run the generator only when this file is executed as a script; importing it
# as a module has no side effects.
if __name__ == '__main__':
    main()