119 lines
3.7 KiB
Python
119 lines
3.7 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Generate deps.mk — pure dependency declarations for the benchmark pipeline.
|
||
|
|
|
||
|
|
Like C .d files: only target: prerequisites lines, no recipes.
|
||
|
|
Recipes stay in the Makefile as generic rules.
|
||
|
|
"""
|
||
|
|
import gzip
|
||
|
|
import re
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
STOP_WORDS = {'complete', 'chromosome', 'whole', 'sequence', 'genome',
|
||
|
|
'endosymbiont', 'of'}
|
||
|
|
STOP_PREFIXES = ('scaffold', 'contig', 'plasmid')
|
||
|
|
|
||
|
|
|
||
|
|
def is_stop(tok):
|
||
|
|
t = tok.lower()
|
||
|
|
return t in STOP_WORDS or any(t.startswith(p) for p in STOP_PREFIXES)
|
||
|
|
|
||
|
|
|
||
|
|
def sanitize(s):
|
||
|
|
return re.sub(r'[^A-Za-z0-9._-]', '_', s).strip('_')
|
||
|
|
|
||
|
|
|
||
|
|
def collect_tokens(text):
|
||
|
|
parts = []
|
||
|
|
for tok in text.split():
|
||
|
|
tok = tok.rstrip(',.')
|
||
|
|
if is_stop(tok):
|
||
|
|
break
|
||
|
|
parts.append(sanitize(tok))
|
||
|
|
return '_'.join(filter(None, parts))
|
||
|
|
|
||
|
|
|
||
|
|
def parse_organism(defn, gcf_id):
|
||
|
|
words = defn.split()
|
||
|
|
species = sanitize(words[0] + '_' + words[1])
|
||
|
|
|
||
|
|
m = re.search(r'\bstr\.\s+(\S+)(?:\s+substr\.\s+(\S+))?', defn)
|
||
|
|
if m:
|
||
|
|
strain = sanitize(m.group(1))
|
||
|
|
if m.group(2):
|
||
|
|
strain += '_' + sanitize(m.group(2))
|
||
|
|
return species, strain
|
||
|
|
|
||
|
|
m = re.search(r'\bstrain\b\s+(.*)', defn)
|
||
|
|
if m:
|
||
|
|
strain = collect_tokens(m.group(1))
|
||
|
|
if strain:
|
||
|
|
return species, strain
|
||
|
|
|
||
|
|
remainder = re.sub(r'^\S+ \S+\s*', '', defn)
|
||
|
|
remainder = re.sub(r'^subsp\.\s+\S+\s*', '', remainder)
|
||
|
|
remainder = re.sub(r'^serovar\s+\S+\s*', '', remainder)
|
||
|
|
strain = collect_tokens(remainder)
|
||
|
|
return species, strain if strain else gcf_id
|
||
|
|
|
||
|
|
|
||
|
|
def first_definition(path):
|
||
|
|
with gzip.open(path, 'rt') as fh:
|
||
|
|
for line in fh:
|
||
|
|
if line.startswith('>'):
|
||
|
|
m = re.search(r'"definition":"([^"]*)"', line)
|
||
|
|
return m.group(1) if m else line[1:].split()[0]
|
||
|
|
return Path(path).stem
|
||
|
|
|
||
|
|
|
||
|
|
def main():
|
||
|
|
entries = [] # (specimen, species, sim_dir, genome_path)
|
||
|
|
species_seen = []
|
||
|
|
|
||
|
|
for path in sorted(sys.argv[1:]):
|
||
|
|
gcf_id = Path(path).name.replace('_genomic.fna.gz', '')
|
||
|
|
defn = first_definition(path)
|
||
|
|
sp, st = parse_organism(defn, gcf_id)
|
||
|
|
specimen = f'{sp}--{st}'
|
||
|
|
sim_dir = f'simulated_data/{sp}/{st}'
|
||
|
|
entries.append((specimen, sp, sim_dir, path))
|
||
|
|
if sp not in species_seen:
|
||
|
|
species_seen.append(sp)
|
||
|
|
|
||
|
|
specimens = [e[0] for e in entries]
|
||
|
|
print('SPECIMENS :=', ' '.join(specimens))
|
||
|
|
print('SPECIES :=', ' '.join(species_seen))
|
||
|
|
|
||
|
|
for specimen, species, sim_dir, genome in entries:
|
||
|
|
reads = f'{sim_dir}/reads_R1.fastq.gz'
|
||
|
|
p_done = f'specimen_index_presence/{specimen}/index.done'
|
||
|
|
p_stats = f'stats/indexing_presence/{specimen}.stats'
|
||
|
|
c_done = f'specimen_index_count/{specimen}/index.done'
|
||
|
|
c_stats = f'stats/indexing_count/{specimen}.stats'
|
||
|
|
ref = f'reference_index/{specimen}.npz'
|
||
|
|
vp = f'stats/verify_presence/{specimen}.stats'
|
||
|
|
vc = f'stats/verify_count/{specimen}.stats'
|
||
|
|
|
||
|
|
print()
|
||
|
|
print(f'# {specimen}')
|
||
|
|
print(f'{reads}: {genome}')
|
||
|
|
print(f'{ref}: {reads}')
|
||
|
|
print(f'{p_done} {p_stats}: {reads}')
|
||
|
|
print(f'{c_done} {c_stats}: {reads}')
|
||
|
|
print(f'{vp}: {ref} {p_done}')
|
||
|
|
print(f'{vc}: {ref} {c_done}')
|
||
|
|
|
||
|
|
print()
|
||
|
|
for sp in species_seen:
|
||
|
|
sp_done = f'specific_index_presence/{sp}/index.done'
|
||
|
|
sp_stats = f'stats/specific_kmer_presence/{sp}.stats'
|
||
|
|
sc_done = f'specific_index_count/{sp}/index.done'
|
||
|
|
sc_stats = f'stats/specific_kmer_count/{sp}.stats'
|
||
|
|
print(f'# {sp}')
|
||
|
|
print(f'{sp_done} {sp_stats}: global_index_presence/index.done')
|
||
|
|
print(f'{sc_done} {sc_stats}: global_index_count/index.done')
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
|
||
|
|
main()
|