171 lines
6.4 KiB
Python
171 lines
6.4 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
"""Verify the merged presence index against all per-specimen reference sets.
|
||
|
|
|
||
|
|
Streams `obikmer dump` once on the merged index, accumulates per-specimen
|
||
|
|
kmer sets from each column, then compares each against its reference .npz.
|
||
|
|
|
||
|
|
Output to stdout: one CSV row per specimen (same columns as verify_presence.py)
|
||
|
|
species,strain,ref_kmers,idx_kmers,false_neg,false_pos,fn_pct,fp_pct
|
||
|
|
"""
|
||
|
|
import argparse
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
|
||
|
|
# ── encoding ──────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
_ENCODE = {'A': 0, 'C': 1, 'G': 2, 'T': 3,
|
||
|
|
'a': 0, 'c': 1, 'g': 2, 't': 3}
|
||
|
|
|
||
|
|
_DECODE = ['A', 'C', 'G', 'T']
|
||
|
|
|
||
|
|
|
||
|
|
def encode_kmer(s: str) -> int:
|
||
|
|
kmer = 0
|
||
|
|
for c in s:
|
||
|
|
kmer = (kmer << 2) | _ENCODE[c]
|
||
|
|
return kmer
|
||
|
|
|
||
|
|
|
||
|
|
def decode_kmer(val: int, k: int) -> str:
|
||
|
|
bases = []
|
||
|
|
for _ in range(k):
|
||
|
|
bases.append(_DECODE[val & 3])
|
||
|
|
val >>= 2
|
||
|
|
return ''.join(reversed(bases))
|
||
|
|
|
||
|
|
|
||
|
|
# ── single-pass dump ──────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def stream_merged_dump(obikmer_bin: str, index_dir: str,
|
||
|
|
) -> tuple[list[str], dict[str, list[int]]]:
|
||
|
|
"""Stream the merged dump once.
|
||
|
|
|
||
|
|
Returns:
|
||
|
|
specimen_names : column labels in dump order (excluding 'kmer')
|
||
|
|
per_specimen : mapping label → list of kmer ints where presence > 0
|
||
|
|
"""
|
||
|
|
cmd = [obikmer_bin, 'dump', index_dir]
|
||
|
|
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
|
||
|
|
text=True)
|
||
|
|
|
||
|
|
header_line = proc.stdout.readline().rstrip('\n')
|
||
|
|
cols = header_line.split(',')
|
||
|
|
specimen_names = cols[1:] # first col is 'kmer'
|
||
|
|
per_specimen: dict[str, list[int]] = {name: [] for name in specimen_names}
|
||
|
|
|
||
|
|
for line in proc.stdout:
|
||
|
|
parts = line.rstrip('\n').split(',')
|
||
|
|
kmer_int = encode_kmer(parts[0])
|
||
|
|
for i, name in enumerate(specimen_names):
|
||
|
|
if int(parts[i + 1]) > 0:
|
||
|
|
per_specimen[name].append(kmer_int)
|
||
|
|
|
||
|
|
proc.wait()
|
||
|
|
if proc.returncode != 0:
|
||
|
|
print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
|
||
|
|
sys.exit(1)
|
||
|
|
|
||
|
|
return specimen_names, per_specimen
|
||
|
|
|
||
|
|
|
||
|
|
# ── per-specimen comparison ───────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def compare_specimen(name: str,
|
||
|
|
kmer_list: list[int],
|
||
|
|
ref_dir: Path,
|
||
|
|
k: int,
|
||
|
|
save_fn: Path | None,
|
||
|
|
save_fp: Path | None,
|
||
|
|
) -> str:
|
||
|
|
"""Compare one specimen column against its reference .npz.
|
||
|
|
|
||
|
|
Returns a CSV row string.
|
||
|
|
"""
|
||
|
|
ref_path = ref_dir / f'{name}.npz'
|
||
|
|
if not ref_path.exists():
|
||
|
|
print(f' SKIP {name}: no reference at {ref_path}', file=sys.stderr)
|
||
|
|
return ''
|
||
|
|
|
||
|
|
species = name.split('--')[0]
|
||
|
|
strain = name[len(species) + 2:]
|
||
|
|
|
||
|
|
ref_kmers = np.load(ref_path)['kmers'] # sorted uint64
|
||
|
|
idx_kmers = np.array(sorted(kmer_list), dtype=np.uint64)
|
||
|
|
|
||
|
|
false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
|
||
|
|
false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)
|
||
|
|
|
||
|
|
fn_pct = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
|
||
|
|
fp_pct = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
|
||
|
|
|
||
|
|
print(f' {name}: ref={len(ref_kmers):,} idx={len(idx_kmers):,} '
|
||
|
|
f'fn={len(false_neg):,} ({fn_pct:.4f}%) '
|
||
|
|
f'fp={len(false_pos):,} ({fp_pct:.4f}%)',
|
||
|
|
file=sys.stderr)
|
||
|
|
|
||
|
|
if save_fn and len(false_neg):
|
||
|
|
fn_file = save_fn / f'{name}_fn.txt'
|
||
|
|
fn_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_neg) + '\n')
|
||
|
|
|
||
|
|
if save_fp and len(false_pos):
|
||
|
|
fp_file = save_fp / f'{name}_fp.txt'
|
||
|
|
fp_file.write_text('\n'.join(decode_kmer(int(v), k) for v in false_pos) + '\n')
|
||
|
|
|
||
|
|
return (f'{species},{strain},'
|
||
|
|
f'{len(ref_kmers)},{len(idx_kmers)},'
|
||
|
|
f'{len(false_neg)},{len(false_pos)},'
|
||
|
|
f'{fn_pct:.4f},{fp_pct:.4f}')
|
||
|
|
|
||
|
|
|
||
|
|
# ── main ─────────────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
def main() -> None:
|
||
|
|
ap = argparse.ArgumentParser(description=__doc__,
|
||
|
|
formatter_class=argparse.RawDescriptionHelpFormatter)
|
||
|
|
ap.add_argument('index', metavar='INDEX_DIR', nargs='?',
|
||
|
|
help='Merged presence index directory')
|
||
|
|
ap.add_argument('ref_dir', metavar='REF_DIR', nargs='?',
|
||
|
|
help='Directory containing per-specimen .npz reference files')
|
||
|
|
ap.add_argument('--obikmer', default='obikmer')
|
||
|
|
ap.add_argument('--header', action='store_true',
|
||
|
|
help='Print CSV header and exit')
|
||
|
|
ap.add_argument('--save-fn', metavar='DIR',
|
||
|
|
help='Directory to save false-negative kmer lists')
|
||
|
|
ap.add_argument('--save-fp', metavar='DIR',
|
||
|
|
help='Directory to save false-positive kmer lists')
|
||
|
|
args = ap.parse_args()
|
||
|
|
|
||
|
|
if args.header:
|
||
|
|
print('species,strain,ref_kmers,idx_kmers,'
|
||
|
|
'false_neg,false_pos,fn_pct,fp_pct')
|
||
|
|
return
|
||
|
|
|
||
|
|
ref_dir = Path(args.ref_dir)
|
||
|
|
save_fn = Path(args.save_fn) if args.save_fn else None
|
||
|
|
save_fp = Path(args.save_fp) if args.save_fp else None
|
||
|
|
if save_fn: save_fn.mkdir(parents=True, exist_ok=True)
|
||
|
|
if save_fp: save_fp.mkdir(parents=True, exist_ok=True)
|
||
|
|
|
||
|
|
# Detect k
|
||
|
|
out1 = subprocess.check_output(
|
||
|
|
[args.obikmer, 'dump', '--head', '1', args.index],
|
||
|
|
stderr=subprocess.DEVNULL, text=True)
|
||
|
|
k = len(out1.splitlines()[1].split(',')[0])
|
||
|
|
|
||
|
|
print(f'k={k} streaming merged dump: {args.index}', file=sys.stderr)
|
||
|
|
specimen_names, per_specimen = stream_merged_dump(args.obikmer, args.index)
|
||
|
|
print(f'{len(specimen_names)} specimen columns loaded', file=sys.stderr)
|
||
|
|
|
||
|
|
for name in specimen_names:
|
||
|
|
row = compare_specimen(name, per_specimen[name], ref_dir, k, save_fn, save_fp)
|
||
|
|
if row:
|
||
|
|
print(row)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
|
||
|
|
main()
|