refactor: implement RoutableSuperKmer and update k-mer indexing pipeline

Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
2026-04-29 22:52:42 +02:00
parent 4e26e3bd40
commit 27f5e88a7b
72 changed files with 10093 additions and 1626 deletions
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+"""Compare the canonical k-mer sets of two FASTA files.
+
+Reports how many k-mers are shared and how many are exclusive to each file.
+Exits with status 1 when the two sets differ.
+Handles plain and gzip-compressed FASTA (.gz).
+
+Usage
+-----
+    compare_kmers.py -k 31 file_a.fasta.gz file_b.fasta.gz
+"""
+
+import argparse
+import gzip
+import sys
+from pathlib import Path
+
+# Translation table mapping each nucleotide to its complement (case preserved).
+# Characters outside ACGT/acgt are absent from the table and pass through unchanged.
+COMP = str.maketrans("ACGTacgt", "TGCAtgca")
+
+
+def revcomp(seq: str) -> str:
+    return seq.translate(COMP)[::-1]
+
+
+def canonical(seq: str) -> str:
+    rc = revcomp(seq)
+    return seq if seq <= rc else rc
+
+
+def open_fasta(path: str):
+    p = Path(path)
+    if p.suffix == ".gz":
+        return gzip.open(path, "rt")
+    return open(path, "r")
+
+
+def iter_sequences(path: str):
+    """Yield (header, sequence) pairs from a FASTA file."""
+    header = None
+    parts = []
+    with open_fasta(path) as fh:
+        for line in fh:
+            line = line.rstrip()
+            if line.startswith(">"):
+                if header is not None:
+                    yield header, "".join(parts)
+                header = line[1:]
+                parts = []
+            else:
+                parts.append(line.upper())
+    if header is not None:
+        yield header, "".join(parts)
+
+
+def extract_kmers(path: str, k: int) -> set[str]:
+    """Return the set of canonical k-mers from all sequences in *path*."""
+    kmers: set[str] = set()
+    for _, seq in iter_sequences(path):
+        # skip any character that is not ACGT
+        for i in range(len(seq) - k + 1):
+            kmer = seq[i : i + k]
+            if all(c in "ACGT" for c in kmer):
+                kmers.add(canonical(kmer))
+    return kmers
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compare canonical k-mer sets between two FASTA files."
+    )
+    parser.add_argument("file_a", help="First FASTA file (reference)")
+    parser.add_argument("file_b", help="Second FASTA file (to compare)")
+    parser.add_argument(
+        "-k", "--kmer-size", type=int, default=31, metavar="K", help="k-mer size (default: 31)"
+    )
+    args = parser.parse_args()
+
+    k = args.kmer_size
+    print(f"k = {k}")
+    print(f"A = {args.file_a}")
+    print(f"B = {args.file_b}")
+    print()
+
+    print("reading A …", file=sys.stderr)
+    set_a = extract_kmers(args.file_a, k)
+    print("reading B …", file=sys.stderr)
+    set_b = extract_kmers(args.file_b, k)
+
+    only_a = set_a - set_b
+    only_b = set_b - set_a
+    common = set_a & set_b
+
+    print(f"{'kmers in A':<25} {len(set_a):>12,}")
+    print(f"{'kmers in B':<25} {len(set_b):>12,}")
+    print(f"{'common':<25} {len(common):>12,}")
+    print(f"{'only in A (lost)':<25} {len(only_a):>12,}")
+    print(f"{'only in B (gained)':<25} {len(only_b):>12,}")
+
+    if only_a or only_b:
+        print("\nSets differ.", file=sys.stderr)
+        sys.exit(1)
+    else:
+        print("\nSets are identical.")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,125 @@
+#!/usr/bin/env bash
+# jj_commit_msg.sh — generate a commit message from the current jj change using aichat
+#
+# Usage: jj_commit_msg.sh
+#   Summarises each changed file's diff individually, then combines all
+#   summaries into a single commit message via aichat.
+#
+# Typical use:
+#   jj describe -m "$(jj_commit_msg.sh)"
+
+set -euo pipefail
+
+# Progress helpers. All of them write to stderr so that stdout carries
+# nothing but the final commit message.
+
+# log <text…> — bold blue "==>" section marker
+log() {
+    printf '\033[1;34m==>\033[0m %s\n' "$*" >&2
+}
+
+# info <text…> — indented grey detail line
+info() {
+    printf '    \033[0;37m%s\033[0m\n' "$*" >&2
+}
+
+# ok <text…> — indented green check mark
+ok() {
+    printf '    \033[0;32m✓\033[0m %s\n' "$*" >&2
+}
+
+# _readable_diff <file>
+#   Prints a human-readable diff for <file> on stdout (nothing if unchanged).
+#   For pathological single-line formats (JSON, minified JS/CSS…), pretty-prints
+#   both the parent (@-) and working-copy versions before diffing so the LLM
+#   sees structured changes rather than one enormous ±line.
+#   Falls back to the raw diff whenever either side cannot be pretty-printed
+#   (e.g. the file is new or deleted, or the formatter is unavailable).
+_readable_diff() {
+    local file="$1"
+    local raw_diff
+    # Force git format: unless configured otherwise, jj prints its own diff
+    # format, which does not prefix changed lines with +/- as assumed below.
+    raw_diff=$(jj diff --git -- "$file")
+    [[ -z "$raw_diff" ]] && return 0
+
+    # Detect pathological diff: any +/- content line longer than 500 chars.
+    # A single awk (no grep) keeps the exit status at 0 when nothing matches,
+    # which matters under `set -o pipefail`.
+    local max_len
+    max_len=$(awk '/^[+-]/ { if (length > m) m = length } END { print m+0 }' <<< "$raw_diff")
+
+    if (( max_len <= 500 )); then
+        printf '%s' "$raw_diff"
+        return
+    fi
+
+    # Pretty-print strategy per extension
+    local ext="${file##*.}"
+    local pretty_old pretty_new
+    case "$ext" in
+        json)
+            pretty_old=$(jj file show -r @- -- "$file" 2>/dev/null | python3 -m json.tool 2>/dev/null || true)
+            pretty_new=$(jj file show          -- "$file" 2>/dev/null | python3 -m json.tool 2>/dev/null || true)
+            ;;
+        js|mjs|cjs|css|ts)
+            local node_fmt='
+                const chunks = [];
+                process.stdin.on("data", d => chunks.push(d));
+                process.stdin.on("end", () => {
+                    const src = chunks.join("");
+                    // Insert newline before { } ( ) ; and after ,
+                    const out = src
+                        .replace(/([{(])/g,  "$1\n  ")
+                        .replace(/([;}])/g,  "\n$1\n")
+                        .replace(/,\s*/g,    ",\n  ");
+                    process.stdout.write(out);
+                });'
+            pretty_old=$(jj file show -r @- -- "$file" 2>/dev/null | node -e "$node_fmt" 2>/dev/null || true)
+            pretty_new=$(jj file show          -- "$file" 2>/dev/null | node -e "$node_fmt" 2>/dev/null || true)
+            ;;
+        *)
+            # Generic fallback: fold long lines at 120 chars
+            pretty_old=$(jj file show -r @- -- "$file" 2>/dev/null | fold -s -w 120 || true)
+            pretty_new=$(jj file show          -- "$file" 2>/dev/null | fold -s -w 120 || true)
+            ;;
+    esac
+
+    if [[ -n "$pretty_old" && -n "$pretty_new" ]]; then
+        # Options precede the operands for portability across diff
+        # implementations; diff exits 1 when the inputs differ, hence `|| true`.
+        diff -u --label "a/${file}" --label "b/${file}" \
+            <(printf '%s\n' "$pretty_old") <(printf '%s\n' "$pretty_new") || true
+    else
+        printf '%s' "$raw_diff"
+    fi
+}
+
+# Collect changed files in the current working copy change
+changed_files=$(jj diff --name-only)
+
+if [[ -z "$changed_files" ]]; then
+    echo "No changed files." >&2
+    exit 1
+fi
+
+file_count=$(wc -l <<< "$changed_files" | tr -d ' ')
+log "Found $file_count changed file(s)"
+
+summaries=""
+n=0
+
+while IFS= read -r file; do
+    diff=$(_readable_diff "$file")
+    if [[ -z "$diff" ]]; then
+        continue
+    fi
+
+    n=$((n + 1))
+    log "[$n/$file_count] Summarising $file …"
+
+    summary=$(printf '%s' "$diff" | aichat "In 2-3 lines, summarise what this diff changes in the file '$file'. Be concise and technical.")
+
+    # Print the summary indented to stderr
+    while IFS= read -r line; do
+        info "$line"
+    done <<< "$summary"
+
+    summaries+="### $file
+$summary
+
+"
+done <<< "$changed_files"
+
+if [[ -z "$summaries" ]]; then
+    echo "No non-empty diffs found." >&2
+    exit 1
+fi
+
+log "Generating commit message from $n summary/summaries …"
+result=$(printf '%s' "$summaries" | aichat "From these per-file summaries of a jj diff, write a single conventional commit message in English. First line: short imperative summary (max 72 chars). Then a blank line. Then a short paragraph with more detail if needed. Output only the commit message, nothing else.")
+
+ok "Done"
+printf '\n' >&2
+
+# Commit message goes to stdout
+printf '%s\n' "$result"