refactor: implement RoutableSuperKmer and update k-mer indexing pipeline
Replace raw SuperKmer routing with a new RoutableSuperKmer type that embeds canonical sequences and precomputed minimizers, enabling direct partition routing via hash. Update the build pipeline to yield RoutableSuperKmers throughout (builder, scatterer), refactor FASTA/unitig export commands to use the new type and compressed outputs (.fasta.gz, .unitigs.fasta.zst), revise SuperKmer header to store n_kmers instead of seql (avoiding 256-byte wrap), and update documentation to reflect minimizer-based theory, two evidence-encoding strategies for unitig-MPHF indexing (global offset vs. ID+rank), and the new obipipeline library architecture with parallel workers, biased scheduling, and error handling.
This commit is contained in:
Executable
+106
@@ -0,0 +1,106 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Compare the canonical k-mer sets of two FASTA files.
|
||||
|
||||
Reports how many k-mers are shared, exclusive to each file, or missing.
|
||||
Handles plain and gzip-compressed FASTA (.gz).
|
||||
|
||||
Usage
|
||||
-----
|
||||
compare_kmers.py -k 31 file_a.fasta.gz file_b.fasta.gz
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Translation table mapping each nucleotide to its complement, preserving case.
COMP = str.maketrans("ACGTacgt", "TGCAtgca")


def revcomp(seq: str) -> str:
    """Return the reverse complement of *seq*.

    Characters outside ``ACGTacgt`` have no entry in ``COMP`` and are
    carried over unchanged; only their position is reversed.
    """
    complemented = seq.translate(COMP)
    return "".join(reversed(complemented))
|
||||
|
||||
|
||||
def canonical(seq: str) -> str:
    """Return the canonical form of *seq*.

    The canonical form is whichever of *seq* and its reverse complement
    sorts first under plain string comparison.
    """
    return min(seq, revcomp(seq))
|
||||
|
||||
|
||||
def open_fasta(path: str):
    """Open a FASTA file for reading in text mode.

    A path whose final extension is ``.gz`` is opened through ``gzip``; the
    extension is matched case-insensitively, so ``.GZ`` works as well.  Any
    other path is opened as a plain text file.

    Returns a text-mode file object.  The caller is responsible for closing
    it, typically with a ``with`` statement.
    """
    # The decision is made from the file name alone so the path is opened
    # exactly once.
    if Path(path).suffix.lower() == ".gz":
        return gzip.open(path, "rt")
    return open(path, "r")
|
||||
|
||||
|
||||
def iter_sequences(path: str):
    """Stream the records of a FASTA file as ``(header, sequence)`` tuples.

    The header is the text following ``>`` with trailing whitespace removed.
    The sequence is the upper-cased concatenation of every line up to the
    next header.  Lines that appear before the first header are ignored.
    """
    current_header = None
    chunks = []
    with open_fasta(path) as handle:
        for raw in handle:
            text = raw.rstrip()
            if not text.startswith(">"):
                chunks.append(text.upper())
                continue
            # A new header closes the record collected so far, if there is one.
            if current_header is not None:
                yield current_header, "".join(chunks)
            current_header, chunks = text[1:], []
    # Flush the last record, which no following header terminated.
    if current_header is not None:
        yield current_header, "".join(chunks)
|
||||
|
||||
|
||||
def extract_kmers(path: str, k: int) -> set[str]:
    """Return the set of canonical k-mers from all sequences in *path*.

    Only windows made up entirely of ``A``, ``C``, ``G`` and ``T`` are kept;
    a window that overlaps any other character is skipped as a whole.
    Sequences shorter than *k* contribute nothing.

    Raises:
        ValueError: if *k* is smaller than 1.
    """
    # k == 0 would register the empty string as a k-mer and a negative k
    # would produce arbitrary slices, so reject both explicitly.
    if k < 1:
        raise ValueError(f"k-mer size must be a positive integer, got {k}")

    kmers: set[str] = set()
    for _, seq in iter_sequences(path):
        # Length of the run of valid nucleotides ending at the current
        # position.  The window ending here is clean exactly when that run
        # spans at least k characters, so every base is inspected once
        # rather than once per overlapping window.
        run = 0
        for end, base in enumerate(seq, start=1):
            run = run + 1 if base in "ACGT" else 0
            if run >= k:
                kmers.add(canonical(seq[end - k : end]))
    return kmers
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: compare the canonical k-mer sets of two FASTA files.

    Prints the parameters and a summary table to stdout and progress
    messages to stderr.  Exits with status 1 when the two sets differ, and
    with status 2 (through ``argparse``) when the arguments are invalid.
    """
    parser = argparse.ArgumentParser(
        description="Compare canonical k-mer sets between two FASTA files."
    )
    parser.add_argument("file_a", help="First FASTA file (reference)")
    parser.add_argument("file_b", help="Second FASTA file (to compare)")
    parser.add_argument(
        "-k", "--kmer-size", type=int, default=31, metavar="K", help="k-mer size (default: 31)"
    )
    args = parser.parse_args()

    k = args.kmer_size
    # A zero or negative size cannot describe a k-mer; stop before reading
    # any input instead of producing meaningless counts.
    if k < 1:
        parser.error(f"k-mer size must be a positive integer (got {k})")

    print(f"k = {k}")
    print(f"A = {args.file_a}")
    print(f"B = {args.file_b}")
    print()

    print("reading A …", file=sys.stderr)
    set_a = extract_kmers(args.file_a, k)
    print("reading B …", file=sys.stderr)
    set_b = extract_kmers(args.file_b, k)

    only_a = set_a - set_b
    only_b = set_b - set_a
    common = set_a & set_b

    rows = (
        ("kmers in A", len(set_a)),
        ("kmers in B", len(set_b)),
        ("common", len(common)),
        ("only in A (lost)", len(only_a)),
        ("only in B (gained)", len(only_b)),
    )
    for label, count in rows:
        print(f"{label:<25} {count:>12,}")

    if only_a or only_b:
        print("\nSets differ.", file=sys.stderr)
        sys.exit(1)
    else:
        print("\nSets are identical.")


if __name__ == "__main__":
    main()
|
||||
Executable
+125
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env bash
|
||||
# jj_commit_msg.sh — generate a commit message from the current jj change using aichat
|
||||
#
|
||||
# Usage: jj_commit_msg.sh
|
||||
# Summarises each changed file's diff individually, then combines all
|
||||
# summaries into a single commit message via aichat.
|
||||
#
|
||||
# Typical use:
|
||||
# jj describe -m "$(jj_commit_msg.sh)"
|
||||
|
||||
# Abort on a failing command, on use of an unset variable, and when any
# stage of a pipeline fails.
set -euo pipefail

# Progress reporting. Everything below writes to stderr so that stdout
# carries nothing but the final commit message.

# log <text…> — step heading, prefixed with a bold blue "==>".
log() {
    printf '\033[1;34m==>\033[0m %s\n' "$*" >&2
}

# info <text…> — detail line, printed in light grey.
info() {
    printf ' \033[0;37m%s\033[0m\n' "$*" >&2
}

# ok <text…> — completion line, prefixed with a green check mark.
ok() {
    printf ' \033[0;32m✓\033[0m %s\n' "$*" >&2
}
|
||||
|
||||
# _pretty_print <ext>
#   Filter: reads a file's content on stdin and writes a multi-line rendering
#   of it to stdout, choosing the formatter from the file extension <ext>.
#   Returns the formatter's exit status, so it is non-zero when the formatter
#   is not installed or rejects the input.
_pretty_print() {
    case "$1" in
        json)
            python3 -m json.tool
            ;;
        js|mjs|cjs|css|ts)
            # Crude re-flow rather than a real formatter: it only has to
            # break one enormous line into many short ones.
            local node_fmt='
const chunks = [];
process.stdin.on("data", d => chunks.push(d));
process.stdin.on("end", () => {
  // Decode once, after concatenation, so a multi-byte character split
  // across two chunks is not corrupted.
  const src = Buffer.concat(chunks).toString("utf8");
  // Break the line after { and (, around ; and }, and after each comma.
  const out = src
    .replace(/([{(])/g, "$1\n ")
    .replace(/([;}])/g, "\n$1\n")
    .replace(/,\s*/g, ",\n ");
  process.stdout.write(out);
});'
            node -e "$node_fmt"
            ;;
        *)
            # Generic fallback: fold long lines at 120 chars
            fold -s -w 120
            ;;
    esac
}

# _readable_diff <file>
#   Prints a human-readable diff for <file> on stdout (nothing when the file
#   has no changes).
#   For pathological single-line formats (JSON, minified JS/CSS…), pretty-prints
#   both the parent (@-) and working-copy versions before diffing so the LLM
#   sees structured changes rather than one enormous ±line. Falls back to the
#   raw diff whenever either version cannot be pretty-printed.
_readable_diff() {
    local file="$1"
    local raw_diff
    # --git forces unified output. The long-line detection below relies on
    # leading +/- markers, which jj's default color-words format does not emit.
    raw_diff=$(jj diff --git -- "$file")
    [[ -z "$raw_diff" ]] && return 0

    # Detect pathological diff: any +/- content line longer than 500 chars
    local max_len
    max_len=$(awk '/^[+-]/ { if (length($0) > m) m = length($0) } END { print m + 0 }' <<< "$raw_diff")

    if (( max_len <= 500 )); then
        printf '%s' "$raw_diff"
        return
    fi

    # Pretty-print both sides with the formatter matching the extension.
    # A failing formatter leaves the variable empty, which selects the
    # raw-diff fallback below.
    local ext="${file##*.}"
    local pretty_old pretty_new
    pretty_old=$(jj file show -r @- -- "$file" 2>/dev/null | _pretty_print "$ext" 2>/dev/null || true)
    pretty_new=$(jj file show -- "$file" 2>/dev/null | _pretty_print "$ext" 2>/dev/null || true)

    if [[ -n "$pretty_old" && -n "$pretty_new" ]]; then
        # Options come before the operands: not every diff implementation
        # accepts them afterwards. diff exits 1 when the inputs differ, which
        # is the expected case here, hence the `|| true`.
        diff -u --label "a/${file}" --label "b/${file}" \
            <(printf '%s\n' "$pretty_old") <(printf '%s\n' "$pretty_new") || true
    else
        printf '%s' "$raw_diff"
    fi
}
|
||||
|
||||
# Fail early, with a clear message, when a required tool is missing instead
# of dying with "command not found" halfway through the run.
for tool in jj aichat; do
    if ! command -v "$tool" >/dev/null 2>&1; then
        echo "Required command not found: $tool" >&2
        exit 1
    fi
done

# Collect changed files in the current working copy change
changed_files=$(jj diff --name-only)

if [[ -z "$changed_files" ]]; then
    echo "No changed files." >&2
    exit 1
fi

file_count=$(wc -l <<< "$changed_files" | tr -d ' ')
log "Found $file_count changed file(s)"

summaries=""
n=0

# The file list is fed through fd 3 rather than stdin: the commands run in
# the loop body inherit stdin, and any of them reading it would silently
# swallow the remaining file names.
while IFS= read -r -u 3 file; do
    file_diff=$(_readable_diff "$file")
    if [[ -z "$file_diff" ]]; then
        continue
    fi

    n=$((n + 1))
    log "[$n/$file_count] Summarising $file …"

    summary=$(printf '%s' "$file_diff" | aichat "In 2-3 lines, summarise what this diff changes in the file '$file'. Be concise and technical.")

    # Print the summary indented to stderr
    while IFS= read -r line; do
        info "$line"
    done <<< "$summary"

    # One section per file: heading, summary, blank separator line.
    summaries+="### $file
$summary

"
done 3<<< "$changed_files"

if [[ -z "$summaries" ]]; then
    echo "No non-empty diffs found." >&2
    exit 1
fi

log "Generating commit message from $n summary/summaries …"
result=$(printf '%s' "$summaries" | aichat "From these per-file summaries of a jj diff, write a single conventional commit message in English. First line: short imperative summary (max 72 chars). Then a blank line. Then a short paragraph with more detail if needed. Output only the commit message, nothing else.")

ok "Done"
printf '\n' >&2

# Commit message goes to stdout
printf '%s\n' "$result"
|
||||
Reference in New Issue
Block a user