fix: strip AI reasoning blocks from commit messages

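Adds a `_strip_think` function that uses `awk` to buffer stdin, track the last line beginning with `</think>`, and emit only the content after it. Both `aichat` calls are now piped through it so AI reasoning blocks are removed from the per-file summaries and from the final commit message; leading blank lines are additionally stripped from the final output with `sed`. Also applies minor whitespace and indentation adjustments throughout the script. On the Rust side, declares the `sequence` module and re-exports the `Sequence` trait, updates the trait's signatures (owned `Box<[u8]>` sequence, borrowed `canonical`, new `annotation` method), implements it for `SuperKmer` (moving `seq_hash` into the trait impl), and renames `CountAnnotation` to `SKAnnotation`.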
2026-05-02 18:01:31 +02:00
parent 0b784242cf
commit 602f414957
4 changed files with 65 additions and 33 deletions
@@ -5,7 +5,7 @@
 #   Summarises each changed file's diff individually, then combines all
 #   summaries into a single commit message via aichat.
 #   REV defaults to `@` (current working copy). Accepts any jj revision:
-#   `@-`, `lk`, a commit ID, a branch name, etc.
+#    `@-`, `lk`, a commit ID, a branch name, etc.
 #
 # Typical use:
 #   jj describe -m "$(jj_commit_msg.sh)"
@@ -18,9 +18,24 @@ set -euo pipefail
 REV="${1:-@}"

 # Log to stderr so progress doesn't pollute the commit message on stdout
-log()  { printf '\033[1;34m==>\033[0m %s\n' "$*" >&2; }
-info() { printf '    \033[0;37m%s\033[0m\n' "$*" >&2; }
-ok()   { printf '    \033[0;32m✓\033[0m %s\n' "$*" >&2; }
+log()    { printf '\033[1;34m==>\033[0m %s\n' "$*" >&2; }
+info() { printf '      \033[0;37m%s\033[0m\n' "$*" >&2; }
+ok()     { printf '      \033[0;32m✓\033[0m %s\n' "$*" >&2; }
+
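+# _strip_think — remove AI reasoning blocks from stdin
+# Buffer all input (echoing each line to stderr), locate the LAST line starting with </think>, and emit only what follows; if there is none, emit everything.
+# Using the last match handles think blocks that themselves contain </think> fragments.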
+_strip_think() {
+    awk '{
+        lines[NR] = $0
+        print $0 > "/dev/stderr"
+        if (/^<\/think>/) last_end = NR
+    }
+    END {
+        start = (last_end ? last_end + 1 : 1)
+        for (i = start; i <= NR; i++) print lines[i]
+    }'
+}

 # _readable_diff <file>
 #   Returns a human-readable diff for <file>.
@@ -31,9 +46,9 @@ _readable_diff() {
    local file="$1"
    local raw_diff
    raw_diff=$(jj diff -r "$REV" -- "$file")
-    [[ -z "$raw_diff" ]] && return 0
+     [[ -z "$raw_diff" ]] && return 0

-    # Detect pathological diff: any +/- content line longer than 500 chars
+      # Detect pathological diff: any +/- content line longer than 500 chars
    local max_len
    max_len=$(grep '^[+-]' <<< "$raw_diff" | awk '{ if (length > m) m = length } END { print m+0 }')

@@ -42,40 +57,40 @@ _readable_diff() {
        return
    fi

-    # Pretty-print strategy per extension
+      # Pretty-print strategy per extension
    local ext="${file##*.}"
    local pretty_old pretty_new
    case "$ext" in
        json)
            pretty_old=$(jj file show -r "$REV@-" -- "$file" 2>/dev/null | python3 -m json.tool 2>/dev/null || true)
            pretty_new=$(jj file show -r "$REV" -- "$file" 2>/dev/null | python3 -m json.tool 2>/dev/null || true)
-            ;;
+              ;;
        js|mjs|cjs|css|ts)
            local node_fmt='
                const chunks = [];
                process.stdin.on("data", d => chunks.push(d));
                process.stdin.on("end", () => {
                    const src = chunks.join("");
-                    // Insert newline before { } ( ) ; and after ,
+                      // Insert newline before { } ( ) ; and after ,
                    const out = src
-                        .replace(/([{(])/g,  "$1\n  ")
-                        .replace(/([;}])/g,  "\n$1\n")
-                        .replace(/,\s*/g,    ",\n  ");
+                          .replace(/([{(])/g,    "$1\n    ")
+                          .replace(/([;}])/g,    "\n$1\n")
+                          .replace(/,\s*/g,      ",\n    ");
                    process.stdout.write(out);
-                });'
+                  });'
            pretty_old=$(jj file show -r "$REV@-" -- "$file" 2>/dev/null | node -e "$node_fmt" 2>/dev/null || true)
            pretty_new=$(jj file show -r "$REV" -- "$file" 2>/dev/null | node -e "$node_fmt" 2>/dev/null || true)
-            ;;
-        *)
-            # Generic fallback: fold long lines at 120 chars
+              ;;
+          *)
+              # Generic fallback: fold long lines at 120 chars
            pretty_old=$(jj file show -r "$REV@-" -- "$file" 2>/dev/null | fold -s -w 120 || true)
            pretty_new=$(jj file show -r "$REV" -- "$file" 2>/dev/null | fold -s -w 120 || true)
-            ;;
+              ;;
    esac

    if [[ -n "$pretty_old" && -n "$pretty_new" ]]; then
        diff <(printf '%s\n' "$pretty_old") <(printf '%s\n' "$pretty_new") \
-            --label "a/${file}" --label "b/${file}" -u || true
+              --label "a/${file}" --label "b/${file}" -u || true
    else
        printf '%s' "$raw_diff"
    fi
@@ -104,9 +119,9 @@ while IFS= read -r file; do
    n=$((n + 1))
    log "[$n/$file_count] Summarising $file …"

-    summary=$(printf '%s' "$diff" | aichat "In 2-3 lines, summarise what this diff changes in the file '$file'. Be concise and technical.")
+    summary=$(printf '%s' "$diff" | aichat "In 2-3 lines, summarise what this diff changes in the file '$file'. Be concise and technical." | _strip_think)

-    # Print the summary indented to stderr
+      # Print the summary indented to stderr
    while IFS= read -r line; do
        info "$line"
    done <<< "$summary"
@@ -123,10 +138,10 @@ if [[ -z "$summaries" ]]; then
 fi

 log "Generating commit message from $n summary/summaries …"
-result=$(printf '%s' "$summaries" | aichat "From these per-file summaries of a jj diff, write a single conventional commit message in English. First line: short imperative summary (max 72 chars). Then a blank line. Then a short paragraph with more detail if needed. Output only the commit message, nothing else.")
+result=$(printf '%s' "$summaries" | aichat "From these per-file summaries of a jj diff, write a single conventional commit message in English. First line: short imperative summary (max 72 chars). Then a blank line. Then a short paragraph with more detail if needed. Output only the commit message, nothing else." | _strip_think)

 ok "Done"
 printf '\n' >&2

-# Commit message goes to stdout
-printf '%s\n' "$result"
+# Commit message goes to stdout (strip leading blank lines so jj sees content)
+printf '%s\n' "$result" | sed '/./,$!d'
@@ -12,6 +12,7 @@ pub mod kmer;
 mod revcomp_lookup;
 /// Routable super-kmer: canonical sequence paired with its minimizer for scatter routing.
 pub mod routable;
+mod sequence;
 pub mod superkmer;

 pub mod unitig;
@@ -19,4 +20,5 @@ pub mod unitig;
 pub use annotations::Annotation;
 pub use kmer::CanonicalKmer;
 pub use routable::RoutableSuperKmer;
+pub use sequence::Sequence;
 pub use superkmer::SuperKmer;
@@ -1,5 +1,8 @@
+use crate::Annotation;
+
 pub trait Sequence {
-    fn sequence(&self) -> &[u8];
-    fn canonical(&self) -> Self;
+    fn sequence(&self) -> Box<[u8]>;
+    fn canonical(&self) -> &Self;
    fn seq_hash(&self) -> u64;
+    fn annotation(&self) -> Annotation;
 }
@@ -1,13 +1,14 @@
 //! Compact 2-bit DNA super-kmer with in-place reverse complement and canonical form.
 use std::io::{self, Write};

+use bitvec::prelude::*;
 use serde::Serialize;
+use xxhash_rust::xxh3::xxh3_64;

+use crate::Sequence;
 use crate::encoding::{DEC4, encode_base};
 use crate::kmer::{CanonicalKmer, Kmer, KmerError};
 use crate::revcomp_lookup::REVCOMP4;
-use bitvec::prelude::*;
-use xxhash_rust::xxh3::xxh3_64;

 // ── SuperKmerHeader ───────────────────────────────────────────────────────────

@@ -53,7 +54,7 @@ impl SuperKmerHeader {
 }

 #[derive(Serialize)]
-struct CountAnnotation {
+struct SKAnnotation {
    seq_length: usize,
    kmer_size: usize,
    minimizer_size: usize,
@@ -90,6 +91,22 @@ impl std::hash::Hash for SuperKmer {
    }
 }

+impl Sequence for SuperKmer {
+    fn sequence(&self) -> Box<[u8]> {
+        self.seq.clone()
+    }
+
+    fn canonical(&self) -> &Self {
+        &self
+    }
+
+    /// Returns the XXH3-64 hash of the packed sequence bytes.
+    fn seq_hash(&self) -> u64 {
+        xxh3_64(&self.seq)
+    }
+
+    fn annotation(&self) -> Annotation {}
+}
 impl SuperKmer {
    /// `seql` is the raw stored byte: 1–255 for lengths 1–255, 0 for length 256.
    pub fn new(seql: u8, seq: Box<[u8]>) -> Self {
@@ -315,11 +332,6 @@ impl SuperKmer {
    pub fn iter_canonical_kmers(&self, k: usize) -> impl Iterator<Item = CanonicalKmer> + '_ {
        self.iter_kmers(k).map(move |km| km.canonical(k))
    }
-
-    /// Returns the XXH3-64 hash of the packed sequence bytes.
-    pub fn seq_hash(&self) -> u64 {
-        xxh3_64(&self.seq)
-    }
 }

 struct SKKmerIter<'a> {