#!/usr/bin/env python3
"""Compare an obikmer count index against a reference kmer set (presence + counts).

Loads the reference .npz (sorted uint64 kmers + uint32 counts from build_reference.py),
streams `obikmer dump` from a --with-counts index, then reports:
  - false negatives : kmers in reference absent from the index
  - false positives : kmers in the index absent from the reference
  - count mismatches: kmers present in both but with differing counts

Output to stdout: one CSV row
  species,strain,ref_kmers,idx_kmers,false_neg,false_pos,count_mismatch,
  fn_pct,fp_pct,cm_pct
"""
import argparse
import subprocess
import sys

import numpy as np


# ── encoding ──────────────────────────────────────────────────────────────────

# Nucleotide letter -> 2-bit code. Upper- and lower-case letters map to the
# same code; any other character is absent from the table.
_ENCODE = dict(zip('ACGTacgt', (0, 1, 2, 3) * 2))

# 2-bit code -> upper-case nucleotide letter (list index is the code).
_DECODE = list('ACGT')


def encode_kmer(s: str) -> int:
    """Pack the nucleotide string *s* into an integer, two bits per base.

    The first character ends up in the most significant position. Letters
    are looked up in ``_ENCODE``; a character missing from that table raises
    ``KeyError``. An empty string encodes to 0.
    """
    code = 0
    for base in s:
        # Multiplying by 4 frees the two low bits for the next base code.
        code = code * 4 + _ENCODE[base]
    return code


def decode_kmer(val: int, k: int) -> str:
    """Unpack the low ``2 * k`` bits of *val* into a *k*-letter string.

    Inverse of ``encode_kmer`` for upper-case input: the most significant
    2-bit group becomes the first letter. Letters come from ``_DECODE``.
    """
    # Walk the 2-bit groups from the most significant (shift 2*(k-1)) down
    # to the least significant (shift 0) so letters come out in order.
    return ''.join(_DECODE[(val >> (2 * pos)) & 3]
                   for pos in range(k - 1, -1, -1))


# ── dump parsing ──────────────────────────────────────────────────────────────

def load_index(obikmer_bin: str, index_dir: str) -> tuple[np.ndarray, np.ndarray]:
    """Stream `obikmer dump` and return (kmers_sorted_uint64, counts_uint32).

    Runs ``obikmer_bin dump index_dir``, skips the first output line (the
    header), and parses every following non-blank line as
    ``kmer,count[,...]``: the first field is encoded with ``encode_kmer``,
    the second is parsed as an integer.

    Returns:
        A pair of arrays of equal length: the encoded kmers as ``uint64`` in
        ascending order, and the matching counts as ``uint32`` permuted the
        same way.

    Raises:
        KeyError / IndexError / ValueError: on a malformed data line.
        SystemExit: with status 1 (after a message on stderr) when the
            subprocess exits with a non-zero status.
    """
    cmd = [obikmer_bin, 'dump', index_dir]
    kmers, counts = [], []
    # The context manager closes the pipe and reaps the child even when a
    # malformed line raises while we are still reading.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL,
                          text=True) as proc:
        lines = iter(proc.stdout)
        next(lines, None)  # discard the header line, if any
        for line in lines:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing on the missing count field.
                continue
            parts = line.rstrip('\n').split(',')
            kmers.append(encode_kmer(parts[0]))
            counts.append(int(parts[1]))
    if proc.returncode != 0:
        print(f'ERROR: obikmer dump exited {proc.returncode}', file=sys.stderr)
        sys.exit(1)
    # Convert each list exactly once; the kmer array serves both as the sort
    # key and as the returned data.
    kmer_arr = np.array(kmers, dtype=np.uint64)
    count_arr = np.array(counts, dtype=np.uint32)
    order = np.argsort(kmer_arr, kind='stable')
    return kmer_arr[order], count_arr[order]


# ── comparison ────────────────────────────────────────────────────────────────

def compare(ref_kmers: np.ndarray, ref_counts: np.ndarray,
            idx_kmers: np.ndarray, idx_counts: np.ndarray,
            ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Return (false_neg, false_pos, cm_ref_kmers, cm_ref_counts, cm_idx_counts).

    Args:
        ref_kmers: reference kmers; must be sorted ascending and free of
            duplicates (``assume_unique=True`` and ``searchsorted`` rely on it).
        ref_counts: counts aligned with ``ref_kmers``.
        idx_kmers: index kmers, sorted ascending and free of duplicates.
        idx_counts: counts aligned with ``idx_kmers``.

    Returns:
        false_neg: kmers of ``ref_kmers`` that do not occur in ``idx_kmers``.
        false_pos: kmers of ``idx_kmers`` that do not occur in ``ref_kmers``.
        cm_ref_kmers, cm_ref_counts, cm_idx_counts: for kmers present in both
            inputs whose counts differ, the kmer, its reference count and its
            index count.

    All returned arrays follow the order of the input they were taken from.
    Either input may be empty.
    """
    false_neg = np.setdiff1d(ref_kmers, idx_kmers, assume_unique=True)
    false_pos = np.setdiff1d(idx_kmers, ref_kmers, assume_unique=True)

    # Count mismatches among shared kmers.
    # Both arrays are sorted so we can use searchsorted.
    pos_in_idx = np.searchsorted(idx_kmers, ref_kmers)

    # A position equal to len(idx_kmers) means the reference kmer is larger
    # than every index kmer, so it cannot be shared. Masking those positions
    # out (rather than clipping them) also keeps an empty index from being
    # indexed at -1.
    in_range = pos_in_idx < len(idx_kmers)
    shared_mask = in_range.copy()
    shared_mask[in_range] = idx_kmers[pos_in_idx[in_range]] == ref_kmers[in_range]

    shared_ref_counts = ref_counts[shared_mask]
    shared_idx_counts = idx_counts[pos_in_idx[shared_mask]]
    mismatch_mask     = shared_ref_counts != shared_idx_counts

    cm_kmers      = ref_kmers[shared_mask][mismatch_mask]
    cm_ref_counts = shared_ref_counts[mismatch_mask]
    cm_idx_counts = shared_idx_counts[mismatch_mask]

    return false_neg, false_pos, cm_kmers, cm_ref_counts, cm_idx_counts


# ── main ─────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: compare an index dump with a reference.

    With ``--header`` only the CSV header is printed. Otherwise the function
    detects k from the first data row of ``obikmer dump --head 1``, loads the
    ``kmers`` and ``counts`` arrays from the reference .npz, loads the index
    through ``load_index``, runs ``compare`` and prints one CSV row on
    stdout. Progress and summary lines go to stderr. The optional
    ``--save-fn`` / ``--save-fp`` / ``--save-cm`` files are written only when
    the corresponding result is non-empty.

    Raises:
        SystemExit: status 2 when REF_NPZ or INDEX_DIR is missing (argparse
            usage error); status 1 when the k-detection subprocess exits with
            a non-zero status.
    """
    # Function-scope import: only this entry point emits CSV.
    import csv

    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument('reference',  metavar='REF_NPZ',   nargs='?',
                    help='Reference .npz file')
    ap.add_argument('index',      metavar='INDEX_DIR', nargs='?',
                    help='obikmer index directory (built with --with-counts)')
    ap.add_argument('--obikmer',  default='obikmer',
                    help='Path to obikmer binary')
    ap.add_argument('--species',  default='')
    ap.add_argument('--strain',   default='')
    ap.add_argument('--header',   action='store_true',
                    help='Print CSV header and exit')
    ap.add_argument('--save-fp',  metavar='FILE',
                    help='Save false-positive kmer strings to FILE')
    ap.add_argument('--save-fn',  metavar='FILE',
                    help='Save false-negative kmer strings to FILE')
    ap.add_argument('--save-cm',  metavar='FILE',
                    help='Save count-mismatch rows (kmer,ref_count,idx_count) to FILE')
    args = ap.parse_args()

    if args.header:
        print('species,strain,ref_kmers,idx_kmers,'
              'false_neg,false_pos,count_mismatch,'
              'fn_pct,fp_pct,cm_pct')
        return

    # The positionals are optional only so that --header can be used alone;
    # without them a comparison would pass None into the subprocess argv.
    if args.reference is None or args.index is None:
        ap.error('REF_NPZ and INDEX_DIR are required unless --header is given')

    # Detect k
    cmd1 = [args.obikmer, 'dump', '--head', '1', args.index]
    try:
        out1 = subprocess.check_output(cmd1, stderr=subprocess.DEVNULL, text=True)
    except subprocess.CalledProcessError as err:
        # Same message and exit status as load_index uses for a failed dump.
        print(f'ERROR: obikmer dump exited {err.returncode}', file=sys.stderr)
        sys.exit(1)
    head_rows = out1.splitlines()
    # A dump with no data row gives nothing to measure k on; keep going so
    # the statistics are still reported, but remember that k is unknown.
    k = len(head_rows[1].split(',')[0]) if len(head_rows) > 1 else None

    # Load reference
    print(f'Loading reference: {args.reference}', file=sys.stderr)
    # Indexing the archive reads the arrays into memory, so it can be closed
    # as soon as both have been fetched.
    with np.load(args.reference) as npz:
        ref_kmers  = npz['kmers']    # sorted uint64
        ref_counts = npz['counts']   # uint32

    # Load index
    print(f'Streaming dump (k={k}): {args.index}', file=sys.stderr)
    idx_kmers, idx_counts = load_index(args.obikmer, args.index)

    print(f'k={k}  ref={len(ref_kmers):,}  idx={len(idx_kmers):,}', file=sys.stderr)

    false_neg, false_pos, cm_kmers, cm_ref, cm_idx = compare(
        ref_kmers, ref_counts, idx_kmers, idx_counts)

    n_shared  = len(ref_kmers) - len(false_neg)
    fn_pct    = 100.0 * len(false_neg) / len(ref_kmers) if len(ref_kmers) else 0.0
    fp_pct    = 100.0 * len(false_pos) / len(idx_kmers) if len(idx_kmers) else 0.0
    cm_pct    = 100.0 * len(cm_kmers)  / n_shared       if n_shared        else 0.0

    print(f'false negatives : {len(false_neg):,}  ({fn_pct:.4f}%)', file=sys.stderr)
    print(f'false positives : {len(false_pos):,}  ({fp_pct:.4f}%)', file=sys.stderr)
    print(f'count mismatches: {len(cm_kmers):,}  ({cm_pct:.4f}% of shared)',
          file=sys.stderr)

    if k is None:
        # Without k the integer kmers cannot be turned back into strings.
        if args.save_fn or args.save_fp or args.save_cm:
            print('WARNING: dump has no data rows, k is unknown; '
                  'kmer lists were not saved', file=sys.stderr)
    else:
        if args.save_fn and len(false_neg):
            with open(args.save_fn, 'w') as fh:
                fh.writelines(decode_kmer(int(v), k) + '\n' for v in false_neg)

        if args.save_fp and len(false_pos):
            with open(args.save_fp, 'w') as fh:
                fh.writelines(decode_kmer(int(v), k) + '\n' for v in false_pos)

        if args.save_cm and len(cm_kmers):
            with open(args.save_cm, 'w') as fh:
                fh.write('kmer,ref_count,idx_count\n')
                fh.writelines(f'{decode_kmer(int(v), k)},{rc},{ic}\n'
                              for v, rc, ic in zip(cm_kmers, cm_ref, cm_idx))

    # csv.writer quotes species/strain values that contain a comma or quote,
    # so the row always has exactly ten fields; plain values are emitted
    # unchanged.
    csv.writer(sys.stdout, lineterminator='\n').writerow([
        args.species, args.strain,
        len(ref_kmers), len(idx_kmers),
        len(false_neg), len(false_pos), len(cm_kmers),
        f'{fn_pct:.4f}', f'{fp_pct:.4f}', f'{cm_pct:.4f}',
    ])


# Run the command-line entry point only when this file is executed as a
# script; importing it as a module just defines the functions above.
if __name__ == '__main__':
    main()