Skip to content

Commit 768bb7f

Browse files
committed
fix(llm): make FTS eval judge parser tolerant of real-world model output
The original parser rejected everything except the literal format `C1=<digit> C2=<digit> ...`. Qwen3, DeepSeek-R1, and similar reasoning models emit <think>...</think> preambles and decorate output with markdown (`**C1** = 1`, `- C1: 1`, `**Reason:**`). Under the old parser every one of those replies fell to the "unparseable" branch and the table report showed judge=- with no explanation surfaced. Parser now: - Strips <think>/<thinking>/<reasoning> blocks before matching. - Tolerates C1=1 / C1: 1 / **C1** = 1 / - C1: 1 / mixed case. - Accepts "Rationale:" as an alternative to "Reason:". - First match per criterion wins, so a model restating the criteria list can't override its later verdict. Reported judge_reason also surfaces in the table report's Notes column when JudgeScore == -1, so a judge failure explains itself without requiring --format json. Tests cover each tolerated variant plus the case where only partial scores are present (still returns the -1 sentinel).
1 parent f44d236 commit 768bb7f

3 files changed

Lines changed: 312 additions & 31 deletions

File tree

internal/ftseval/harness.go

Lines changed: 79 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ package ftseval
66
import (
77
"context"
88
"fmt"
9+
"regexp"
10+
"strconv"
911
"strings"
1012
"time"
1113

@@ -264,7 +266,7 @@ func (r *runner) applyEntityHit(arm *ArmResult, q Question, ftsContext string) {
264266
}
265267

266268
// runJudge makes one LLM call asking the judge model to score C1..C5 on a
267-
// 0/1 each, summed to 0-5. The reply is parsed by looking for digits.
269+
// 0/1 each, summed to 0-5.
268270
func (r *runner) runJudge(ctx context.Context, arm *ArmResult, q Question) {
269271
judgePrompt := fmt.Sprintf(
270272
`You are grading an AI assistant's answer to a user's question about their home-management database.
@@ -278,9 +280,9 @@ C5: Is the answer free of irrelevant content?
278280
279281
Extra context for grading: %s
280282
281-
Reply in EXACTLY this format, with no other prose:
282-
C1=<0|1> C2=<0|1> C3=<0|1> C4=<0|1> C5=<0|1>
283-
Reason: <one short sentence>`,
283+
Reply with each score on its own line in the format "Cn=0" or "Cn=1"
284+
(n from 1 to 5), followed by a single line "Reason: <one short sentence>".
285+
Think first if you want, but the grading lines must be present.`,
284286
q.JudgePrompt,
285287
)
286288

@@ -298,51 +300,97 @@ Reason: <one short sentence>`,
298300
}
299301
score, reason := parseJudgeReply(reply)
300302
if score < 0 {
301-
arm.Grade.JudgeReason = "judge reply unparseable: " + strings.TrimSpace(reply)
303+
arm.Grade.JudgeReason = "judge_parse_failed: " + truncateReply(reply)
302304
return
303305
}
304306
arm.Grade.JudgeScore = score
305307
arm.Grade.JudgeReason = reason
306308
}
307309

308-
// parseJudgeReply extracts "C1=0 C2=1 ..." and the reason line. Returns
309-
// score=-1 when the expected pattern is missing.
310+
// judgeScoreRE matches one criterion line. Tolerates:
311+
// - separator variants: `=`, `:`, ` = `, ` : `
312+
// - markdown decoration around the criterion name or score:
313+
// `**C1**=1`, `- C1 = 1`, `**C1**=**1**`
314+
// - case: `c1`, `C1`
315+
//
316+
// The `\s*\**\s*` run on each side of the separator absorbs `**`
317+
// (markdown bold) and any surrounding whitespace common in model
318+
// output.
319+
var judgeScoreRE = regexp.MustCompile(`(?i)\bc\s*([1-5])\s*\**\s*[:=]\s*\**\s*([01])\b`)
320+
321+
// judgeReasonRE matches the reason line. Accepts "Reason:", "reason =",
322+
// "Rationale:" etc. (?i) for case; (?s) deliberately NOT set so the
323+
// capture stops at the first newline.
324+
var judgeReasonRE = regexp.MustCompile(`(?i)\b(?:reason|rationale)\b\s*[:=]\s*(.+)`)
325+
326+
// judgeThinkREs strip reasoning-model preambles. Qwen3, DeepSeek-R1,
327+
// and similar models emit `<think>…</think>`, `<thinking>…</thinking>`,
328+
// or `<reasoning>…</reasoning>` before the actual answer. RE2 has no
329+
// backreferences, so we keep one regex per tag name.
330+
var judgeThinkREs = []*regexp.Regexp{
331+
regexp.MustCompile(`(?is)<think>.*?</think>`),
332+
regexp.MustCompile(`(?is)<thinking>.*?</thinking>`),
333+
regexp.MustCompile(`(?is)<reasoning>.*?</reasoning>`),
334+
}
335+
336+
// parseJudgeReply extracts C1..C5 scores and an optional reason from a
337+
// judge-model reply. Returns (-1, "") when any of the five criteria are
338+
// missing. Tolerant of formatting variation: reasoning preambles,
339+
// markdown decoration, `:` vs `=` separators, mixed case, and stray
340+
// whitespace all work.
310341
func parseJudgeReply(reply string) (int, string) {
311-
score := 0
312-
var found int
313-
for i := 1; i <= 5; i++ {
314-
tag := fmt.Sprintf("C%d=", i)
315-
idx := strings.Index(reply, tag)
316-
if idx < 0 {
317-
return -1, ""
342+
for _, re := range judgeThinkREs {
343+
reply = re.ReplaceAllString(reply, "")
344+
}
345+
346+
matches := judgeScoreRE.FindAllStringSubmatch(reply, -1)
347+
byCriterion := map[int]int{}
348+
for _, m := range matches {
349+
idx, err := strconv.Atoi(m[1])
350+
if err != nil || idx < 1 || idx > 5 {
351+
continue
318352
}
319-
after := reply[idx+len(tag):]
320-
if len(after) == 0 {
321-
return -1, ""
353+
val, err := strconv.Atoi(m[2])
354+
if err != nil || (val != 0 && val != 1) {
355+
continue
322356
}
323-
switch after[0] {
324-
case '1':
325-
score++
326-
found++
327-
case '0':
328-
found++
329-
default:
330-
return -1, ""
357+
// First match wins so a model restating the criteria mid-reply
358+
// can't override its final verdict.
359+
if _, seen := byCriterion[idx]; !seen {
360+
byCriterion[idx] = val
331361
}
332362
}
333-
if found != 5 {
363+
if len(byCriterion) != 5 {
334364
return -1, ""
335365
}
366+
score := 0
367+
for i := 1; i <= 5; i++ {
368+
score += byCriterion[i]
369+
}
370+
336371
reason := ""
337-
if idx := strings.Index(reply, "Reason:"); idx >= 0 {
338-
reason = strings.TrimSpace(reply[idx+len("Reason:"):])
339-
if nl := strings.IndexByte(reason, '\n'); nl >= 0 {
340-
reason = reason[:nl]
341-
}
372+
if m := judgeReasonRE.FindStringSubmatch(reply); len(m) >= 2 {
373+
// Strip markdown decoration (** _ *) and whitespace so "Reason:
374+
// **mostly there**" normalizes to "mostly there".
375+
reason = strings.Trim(m[1], " \t*_")
342376
}
343377
return score, reason
344378
}
345379

380+
// truncateReply keeps the last chunk of a judge reply so parse-failure
381+
// reasons surface something recognizable in the report without flooding
382+
// it. Prefers the tail because reasoning models put the verdict at the
383+
// end; the leading <think> block is rarely informative about why the
384+
// parse failed.
385+
func truncateReply(reply string) string {
386+
reply = strings.TrimSpace(reply)
387+
const maxLen = 200
388+
if len(reply) <= maxLen {
389+
return reply
390+
}
391+
return "..." + reply[len(reply)-maxLen:]
392+
}
393+
346394
// ExitCode derives a process exit code from the results under cfg.Strict.
347395
// Returns 0 when nothing regressed and 1 on a per-question rubric
348396
// regression (FTS-on rubric strictly less than FTS-off rubric) among

internal/ftseval/harness_test.go

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ package ftseval
55

66
import (
77
"bytes"
8+
"strconv"
9+
"strings"
810
"testing"
911

1012
"github.com/stretchr/testify/assert"
@@ -26,10 +28,96 @@ func TestParseJudgeReplyMissingTag(t *testing.T) {
2628

2729
func TestParseJudgeReplyInvalidDigit(t *testing.T) {
2830
t.Parallel()
31+
// 9 isn't a valid criterion score; the regex rejects it via [01].
32+
// The remaining four criteria aren't enough to reach 5, so sentinel.
2933
score, _ := parseJudgeReply("C1=9 C2=1 C3=0 C4=1 C5=1")
3034
assert.Equal(t, -1, score, "non-binary digit must produce sentinel -1")
3135
}
3236

37+
func TestParseJudgeReplyColonSeparator(t *testing.T) {
38+
t.Parallel()
39+
score, reason := parseJudgeReply("C1: 1\nC2: 1\nC3: 0\nC4: 1\nC5: 0\nReason: mostly there")
40+
assert.Equal(t, 3, score)
41+
assert.Equal(t, "mostly there", reason)
42+
}
43+
44+
func TestParseJudgeReplyMarkdownDecoration(t *testing.T) {
45+
t.Parallel()
46+
score, reason := parseJudgeReply(`- **C1** = 1
47+
- **C2** = 0
48+
- **C3** = 1
49+
- **C4** = 1
50+
- **C5** = 0
51+
52+
**Reason:** entity naming was off`)
53+
assert.Equal(t, 3, score)
54+
assert.Equal(t, "entity naming was off", reason)
55+
}
56+
57+
func TestParseJudgeReplyStripsThinkBlock(t *testing.T) {
58+
t.Parallel()
59+
// qwen3 / deepseek-r1 emit <think>...</think> preambles. The
60+
// parser must ignore them even when they contain strings that
61+
// look like scores (the model might analyze "C1").
62+
reply := `<think>
63+
Let me analyze C1: the answer addresses the question. I should give it 1.
64+
Similarly C2 through C5... I'll check each.
65+
</think>
66+
67+
C1=1 C2=1 C3=1 C4=0 C5=1
68+
Reason: SQL was awkward but correct`
69+
score, reason := parseJudgeReply(reply)
70+
assert.Equal(t, 4, score)
71+
assert.Equal(t, "SQL was awkward but correct", reason)
72+
}
73+
74+
func TestParseJudgeReplyFirstMatchWins(t *testing.T) {
75+
t.Parallel()
76+
// When a model restates the criteria list before scoring (common
77+
// pattern), the SECOND mention is the verdict. The parser should
78+
// take the first match per criterion instead — restated criteria
79+
// often come with no score attached.
80+
reply := `Restating criteria: C1: ?, C2: ?, C3: ?, C4: ?, C5: ?
81+
Grading:
82+
C1=1 C2=1 C3=0 C4=0 C5=1
83+
Reason: partial credit`
84+
score, _ := parseJudgeReply(reply)
85+
// The "?" lines don't match [01] so they're ignored; only the
86+
// grading lines count. Score = 1+1+0+0+1 = 3.
87+
assert.Equal(t, 3, score)
88+
}
89+
90+
func TestParseJudgeReplyRationaleKeyword(t *testing.T) {
91+
t.Parallel()
92+
score, reason := parseJudgeReply("C1=1 C2=1 C3=1 C4=1 C5=1\nRationale: perfect")
93+
assert.Equal(t, 5, score)
94+
assert.Equal(t, "perfect", reason)
95+
}
96+
97+
func TestParseJudgeReplyWithReasonButNoScores(t *testing.T) {
98+
t.Parallel()
99+
// Only partial scores — must be treated as unparseable even
100+
// though a reason is present.
101+
score, reason := parseJudgeReply("C1=1 C2=1\nReason: incomplete")
102+
assert.Equal(t, -1, score)
103+
assert.Equal(t, "", reason)
104+
}
105+
106+
func TestTruncateReplyShort(t *testing.T) {
107+
t.Parallel()
108+
assert.Equal(t, "short reply", truncateReply(" short reply "))
109+
}
110+
111+
func TestTruncateReplyLong(t *testing.T) {
112+
t.Parallel()
113+
long := strings.Repeat("x", 250) + "THE VERDICT"
114+
got := truncateReply(long)
115+
assert.True(t, strings.HasSuffix(got, "THE VERDICT"),
116+
"truncation must keep the tail (where the verdict lives)")
117+
assert.True(t, strings.HasPrefix(got, "..."),
118+
"truncation must mark the leading cut")
119+
}
120+
33121
func TestStripFences(t *testing.T) {
34122
t.Parallel()
35123
cases := []struct{ in, want string }{
@@ -143,3 +231,110 @@ Vendor "Pacific" (id: 01JY)`
143231
assert.Equal(t, 2, arm.Grade.EntitiesTotal,
144232
"only non-empty expected IDs count toward totals")
145233
}
234+
235+
// sampleResults returns a minimal RunResult slice for smoke-testing
236+
// every WriteReport format path. Covers the three interesting cases
237+
// the formatters branch on: completed with judge, incomplete (stage-1
238+
// error, no summary), and completed with --skip-judge sentinel.
239+
func sampleResults() []RunResult {
240+
return []RunResult{
241+
{
242+
Question: Question{
243+
Name: "kitchen-status",
244+
ExpectedEntityIDs: []string{"01JX"},
245+
},
246+
FTSOn: ArmResult{
247+
GeneratedSQL: "SELECT 1",
248+
SummaryText: "done",
249+
Grade: GradeResult{
250+
Rubric: 2,
251+
RubricTotal: 3,
252+
JudgeScore: 4,
253+
EntitiesHit: 1,
254+
EntitiesTotal: 1,
255+
},
256+
},
257+
FTSOff: ArmResult{
258+
GeneratedSQL: "SELECT 1",
259+
SummaryText: "done",
260+
Grade: GradeResult{
261+
Rubric: 1,
262+
RubricTotal: 3,
263+
JudgeScore: 2,
264+
EntitiesHit: 0,
265+
EntitiesTotal: 1,
266+
},
267+
},
268+
},
269+
{
270+
Question: Question{Name: "stage1-fail"},
271+
FTSOn: ArmResult{
272+
ErrorKind: errorKindStage1,
273+
ErrorMsg: "provider down",
274+
Grade: GradeResult{Rubric: 0, RubricTotal: 2, JudgeScore: -1},
275+
},
276+
FTSOff: ArmResult{
277+
Grade: GradeResult{Rubric: 2, RubricTotal: 2, JudgeScore: 3},
278+
},
279+
},
280+
{
281+
Question: Question{Name: "skip-judge"},
282+
FTSOn: ArmResult{Grade: GradeResult{Rubric: 3, RubricTotal: 3, JudgeScore: -1}},
283+
FTSOff: ArmResult{Grade: GradeResult{Rubric: 3, RubricTotal: 3, JudgeScore: -1}},
284+
},
285+
}
286+
}
287+
288+
// TestWriteReportFormatsDoNotPanic is a smoke test for every format
289+
// WriteReport supports. Before this test landed, writeTableReport
290+
// passed nils to lipgloss.HasDarkBackground and SIGSEGV'd at runtime;
291+
// nothing in the package exercised the styled path. This test drives
292+
// each format + a few NoAB permutations so any nil-dereference,
293+
// out-of-range index, or formatter panic surfaces in CI.
294+
func TestWriteReportFormatsDoNotPanic(t *testing.T) {
295+
t.Parallel()
296+
results := sampleResults()
297+
298+
cases := []struct {
299+
format string
300+
noAB bool
301+
}{
302+
{"table", false},
303+
{"table", true},
304+
{"markdown", false},
305+
{"markdown", true},
306+
{"json", false},
307+
{"", false}, // unknown/empty should fall through to table
308+
}
309+
for _, tc := range cases {
310+
t.Run(tc.format+"-noab-"+strconv.FormatBool(tc.noAB), func(t *testing.T) {
311+
var buf bytes.Buffer
312+
cfg := Config{
313+
Provider: "ollama",
314+
Model: "qwen3",
315+
Format: tc.format,
316+
NoAB: tc.noAB,
317+
}
318+
require.NotPanics(t, func() {
319+
require.NoError(t, WriteReport(&buf, cfg, results))
320+
})
321+
assert.NotEmpty(t, buf.String(),
322+
"format=%q noAB=%v produced no output", tc.format, tc.noAB)
323+
})
324+
}
325+
}
326+
327+
// TestWriteReportEmptyResults covers the zero-question case so the
328+
// aggregate block and tables handle an empty slice without touching
329+
// out-of-range indices.
330+
func TestWriteReportEmptyResults(t *testing.T) {
331+
t.Parallel()
332+
for _, f := range []string{"table", "markdown", "json"} {
333+
t.Run(f, func(t *testing.T) {
334+
var buf bytes.Buffer
335+
require.NotPanics(t, func() {
336+
require.NoError(t, WriteReport(&buf, Config{Format: f}, nil))
337+
})
338+
})
339+
}
340+
}

0 commit comments

Comments
 (0)