#!/usr/bin/env python3 """Verify the merged presence index against all per-specimen reference sets. Streams `obikmer dump` once on the merged index, accumulates per-specimen kmer sets from each column, then compares each against its reference .npz. Output to stdout: one CSV row per specimen (same columns as verify_presence.py) species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct """ import argparse import subprocess import sys from pathlib import Path import numpy as np # ── encoding ────────────────────────────────────────────────────────────────── _ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'a': 0, 'c': 1, 'g': 2, 't': 3} _DECODE = ['A', 'C', 'G', 'T'] def encode_kmer(s: str) -> int: kmer = 0 for c in s: kmer = (kmer << 2) | _ENCODE[c] return kmer def decode_kmer(val: int, k: int) -> str: bases = [] for _ in range(k): bases.append(_DECODE[val & 3]) val >>= 2 return ''.join(reversed(bases)) # ── single-pass dump ────────────────────────────────────────────────────────── def stream_merged_dump(obikmer_bin: str, index_dir: str, ) -> tuple[list[str], dict[str, list[int]]]: """Stream the merged dump once. Returns: specimen_names : column labels in dump order (excluding 'kmer') per_specimen : mapping label → list of kmer ints where presence > 0 """ cmd = [obikmer_bin, 'dump', index_dir] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True) header_line = proc.stdout.readline().rstrip('\n') cols = header_line.split(',') specimen_names = cols[1:] # first col is 'kmer' per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names} for line in proc.stdout: parts = line.rstrip('\n').split(',') kmer_int = encode_kmer(parts[0]) for i, name in enumerate(specimen_names): if int(parts[i + 1]) > 0: per_specimen[name].append(kmer_int) proc.wait() if proc.returncode != 0: print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr) sys.exit(1) return specimen_names, per_specimen # ── per-specimen comparison ─────────────────────────────────────────────────── def compare_specimen(name: str, kmer_list: list[int], ref_dir: Path, k: int, save_fn: Path | None, save_fp: Path | None, ) -> str: """Compare one specimen column against its reference .npz. Returns a CSV row string. """ ref_path = ref_dir / f'{name}.npz' if not ref_path.exists(): print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr) return '' species = name.split('--')[0] strain = name[len(species) + 2:] ref_kmers = np.load(ref_path)['kmers'] # sorted uint64 idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64) false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True) false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True) fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0 fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0 print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} ' f'fn={len(false_neg):,} ({fn_pct:.4f}%) ' f'fp={len(false_pos):,} ({fp_pct:.4f}%)', file=sys.stderr) if save_fn and len(false_neg): fn_file = save_fn / f'{name}_fn.txt' fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n') if save_fp and len(false_pos): fp_file = save_fp / f'{name}_fp.txt' fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n') return (f'{species},{strain},' f'{len(ref_kmers)},{len(idx_kmers)},' f'{len(false_neg)},{len(false_pos)},' f'{fn_pct:.4f},{fp_pct:.4f}') # ── main ───────────────────────────────────────────────────────────────────── def main() -> None: ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) ap.add_argument('index', metavar='INDEX_DIR', nargs='?', help='Merged presence index directory') ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?', help='Directory containing per-specimen .npz reference files') ap.add_argument('--obikmer', default='obikmer') ap.add_argument('--header', action='store_true', help='Print CSV header and exit') ap.add_argument('--save-fn', metavar='DIR', help='Directory to save false-negative kmer lists') ap.add_argument('--save-fp', metavar='DIR', help='Directory to save false-positive kmer lists') args = ap.parse_args() if args.header: print('species,strain,ref_kmers,idx_kmers,' 'false_neg,false_pos,fn_pct,fp_pct') return ref_dir = Path(args.ref_dir) save_fn = Path(args.save_fn) if args.save_fn else None save_fp = Path(args.save_fp) if args.save_fp else None if save_fn: save_fn.mkdir(parents=True, exist_ok=True) if save_fp: save_fp.mkdir(parents=True, exist_ok=True) # Detect k out1 = subprocess.check_output( [args.obikmer, 'dump', '--head', '1', args.index], stderr=subprocess.DEVNULL, text=True) k = len(out1.splitlines()[1].split(',')[0]) print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr) specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index) print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr) for name in specimen_names: row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp) if row: print(row) if __name__ == '__main__': main()