Skip to content

Commit 768bb7f

Browse files
committed
fix(llm): make FTS eval judge parser tolerant of real-world model output
The original parser rejected everything except the literal format `C1=<digit> C2=<digit> ...`. Qwen3, DeepSeek-R1, and similar reasoning models emit <think>...</think> preambles and decorate output with markdown (`**C1** = 1`, `- C1: 1`, `**Reason:**`). Under the old parser every one of those replies fell to the "unparseable" branch and the table report showed judge=- with no explanation surfaced. Parser now: - Strips <think>/<thinking>/<reasoning> blocks before matching. - Tolerates C1=1 / C1: 1 / **C1** = 1 / - C1: 1 / mixed case. - Accepts "Rationale:" as an alternative to "Reason:". - First match per criterion wins, so a model restating the criteria list can't override its later verdict. Reported judge_reason also surfaces in the table report's Notes column when JudgeScore == -1, so a judge failure explains itself without requiring --format json. Tests cover each tolerated variant plus the case where only partial scores are present (still returns the -1 sentinel).
1 parent f44d236 commit 768bb7f

3 files changed

Lines changed: 312 additions & 31 deletions

File tree

internal/ftseval/harness.go

Lines changed: 79 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ package ftseval
66
import (
77
"context"
88
"fmt"
9+
"regexp"
10+
"strconv"
911
"strings"
1012
"time"
1113

@@ -264,7 +266,7 @@ func (r *runner) applyEntityHit(arm *ArmResult, q Question, ftsContext string) {
264266
}
265267

266268
// runJudge makes one LLM call asking the judge model to score C1..C5 on a
267-
// 0/1 each, summed to 0-5. The reply is parsed by looking for digits.
269+
// 0/1 each, summed to 0-5.
268270
func (r *runner) runJudge(ctx context.Context, arm *ArmResult, q Question) {
269271
judgePrompt := fmt.Sprintf(
270272
`You are grading an AI assistant's answer to a user's question about their home-management database.
@@ -278,9 +280,9 @@ C5: Is the answer free of irrelevant content?
278280
279281
Extra context for grading: %s
280282
281-
Reply in EXACTLY this format, with no other prose:
282-
C1=<0|1> C2=<0|1> C3=<0|1> C4=<0|1> C5=<0|1>
283-
Reason: <one short sentence>`,
283+
Reply with each score on its own line in the format "Cn=0" or "Cn=1"
284+
(n from 1 to 5), followed by a single line "Reason: <one short sentence>".
285+
Think first if you want, but the grading lines must be present.`,
284286
q.JudgePrompt,
285287
)
286288

@@ -298,51 +300,97 @@ Reason: <one short sentence>`,
298300
}
299301
score, reason := parseJudgeReply(reply)
300302
if score < 0 {
301-
arm.Grade.JudgeReason = "judge reply unparseable: " + strings.TrimSpace(reply)
303+
arm.Grade.JudgeReason = "judge_parse_failed: " + truncateReply(reply)
302304
return
303305
}
304306
arm.Grade.JudgeScore = score
305307
arm.Grade.JudgeReason = reason
306308
}
307309

308-
// parseJudgeReply extracts "C1=0 C2=1 ..." and the reason line. Returns
309-
// score=-1 when the expected pattern is missing.
310+
// judgeScoreRE matches one criterion line. Tolerates:
311+
// - separator variants: `=`, `:`, ` = `, ` : `
312+
// - markdown decoration around the criterion name or score:
313+
// `**C1**=1`, `- C1 = 1`, `**C1**=**1**`
314+
// - case: `c1`, `C1`
315+
//
316+
// The `\s*\**\s*` run on each side of the separator absorbs `**`
317+
// (markdown bold) and any surrounding whitespace common in model
318+
// output.
319+
var judgeScoreRE = regexp.MustCompile(`(?i)\bc\s*([1-5])\s*\**\s*[:=]\s*\**\s*([01])\b`)
320+
321+
// judgeReasonRE matches the reason line. Accepts "Reason:", "reason =",
322+
// "Rationale:" etc. (?i) for case; (?s) deliberately NOT set so the
323+
// capture stops at the first newline.
324+
var judgeReasonRE = regexp.MustCompile(`(?i)\b(?:reason|rationale)\b\s*[:=]\s*(.+)`)
325+
326+
// judgeThinkREs strip reasoning-model preambles. Qwen3, DeepSeek-R1,
327+
// and similar models emit `<think>…</think>`, `<thinking>…</thinking>`,
328+
// or `<reasoning>…</reasoning>` before the actual answer. RE2 has no
329+
// backreferences, so we keep one regex per tag name.
330+
var judgeThinkREs = []*regexp.Regexp{
331+
regexp.MustCompile(`(?is)<think>.*?</think>`),
332+
regexp.MustCompile(`(?is)<thinking>.*?</thinking>`),
333+
regexp.MustCompile(`(?is)<reasoning>.*?</reasoning>`),
334+
}
335+
336+
// parseJudgeReply extracts C1..C5 scores and an optional reason from a
337+
// judge-model reply. Returns (-1, "") when any of the five criteria are
338+
// missing. Tolerant of formatting variation: reasoning preambles,
339+
// markdown decoration, `:` vs `=` separators, mixed case, and stray
340+
// whitespace all work.
310341
func parseJudgeReply(reply string) (int, string) {
311-
score := 0
312-
var found int
313-
for i := 1; i <= 5; i++ {
314-
tag := fmt.Sprintf("C%d=", i)
315-
idx := strings.Index(reply, tag)
316-
if idx < 0 {
317-
return -1, ""
342+
for _, re := range judgeThinkREs {
343+
reply = re.ReplaceAllString(reply, "")
344+
}
345+
346+
matches := judgeScoreRE.FindAllStringSubmatch(reply, -1)
347+
byCriterion := map[int]int{}
348+
for _, m := range matches {
349+
idx, err := strconv.Atoi(m[1])
350+
if err != nil || idx < 1 || idx > 5 {
351+
continue
318352
}
319-
after := reply[idx+len(tag):]
320-
if len(after) == 0 {
321-
return -1, ""
353+
val, err := strconv.Atoi(m[2])
354+
if err != nil || (val != 0 && val != 1) {
355+
continue
322356
}
323-
switch after[0] {
324-
case '1':
325-
score++
326-
found++
327-
case '0':
328-
found++
329-
default:
330-
return -1, ""
357+
// First match wins so a model restating the criteria mid-reply
358+
// can't override its final verdict.
359+
if _, seen := byCriterion[idx]; !seen {
360+
byCriterion[idx] = val
331361
}
332362
}
333-
if found != 5 {
363+
if len(byCriterion) != 5 {
334364
return -1, ""
335365
}
366+
score := 0
367+
for i := 1; i <= 5; i++ {
368+
score += byCriterion[i]
369+
}
370+
336371
reason := ""
337-
if idx := strings.Index(reply, "Reason:"); idx >= 0 {
338-
reason = strings.TrimSpace(reply[idx+len("Reason:"):])
339-
if nl := strings.IndexByte(reason, '\n'); nl >= 0 {
340-
reason = reason[:nl]
341-
}
372+
if m := judgeReasonRE.FindStringSubmatch(reply); len(m) >= 2 {
373+
// Strip markdown decoration (** _ *) and whitespace so "Reason:
374+
// **mostly there**" normalizes to "mostly there".
375+
reason = strings.Trim(m[1], " \t*_")
342376
}
343377
return score, reason
344378
}
345379

380+
// truncateReply keeps the last chunk of a judge reply so parse-failure
381+
// reasons surface something recognizable in the report without flooding
382+
// it. Prefers the tail because reasoning models put the verdict at the
383+
// end; the leading <think> block is rarely informative about why the
384+
// parse failed.
385+
func truncateReply(reply string) string {
386+
reply = strings.TrimSpace(reply)
387+
const maxLen = 200
388+
if len(reply) <= maxLen {
389+
return reply
390+
}
391+
return "..." + reply[len(reply)-maxLen:]
392+
}
393+
346394
// ExitCode derives a process exit code from the results under cfg.Strict.
347395
// Returns 0 when nothing regressed and 1 on a per-question rubric
348396
// regression (FTS-on rubric strictly less than FTS-off rubric) among

internal/ftseval/harness_test.go

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ package ftseval
55

66
import (
77
"bytes"
8+
"strconv"
9+
"strings"
810
"testing"
911

1012
"github.com/stretchr/testify/assert"
@@ -26,10 +28,96 @@ func TestParseJudgeReplyMissingTag(t *testing.T) {
2628

2729
func TestParseJudgeReplyInvalidDigit(t *testing.T) {
2830
t.Parallel()
31+
// 9 isn't a valid criterion score; the regex rejects it via [01].
32+
// The remaining four criteria aren't enough to reach 5, so sentinel.
2933
score, _ := parseJudgeReply("C1=9 C2=1 C3=0 C4=1 C5=1")
3034
assert.Equal(t, -1, score, "non-binary digit must produce sentinel -1")
3135
}
3236

37+
func TestParseJudgeReplyColonSeparator(t *testing.T) {
38+
t.Parallel()
39+
score, reason := parseJudgeReply("C1: 1\nC2: 1\nC3: 0\nC4: 1\nC5: 0\nReason: mostly there")
40+
assert.Equal(t, 3, score)
41+
assert.Equal(t, "mostly there", reason)
42+
}
43+
44+
func TestParseJudgeReplyMarkdownDecoration(t *testing.T) {
45+
t.Parallel()
46+
score, reason := parseJudgeReply(`- **C1** = 1
47+
- **C2** = 0
48+
- **C3** = 1
49+
- **C4** = 1
50+
- **C5** = 0
51+
52+
**Reason:** entity naming was off`)
53+
assert.Equal(t, 3, score)
54+
assert.Equal(t, "entity naming was off", reason)
55+
}
56+
57+
func TestParseJudgeReplyStripsThinkBlock(t *testing.T) {
58+
t.Parallel()
59+
// qwen3 / deepseek-r1 emit <think>...</think> preambles. The
60+
// parser must ignore them even when they contain strings that
61+
// look like scores (the model might analyze "C1").
62+
reply := `<think>
63+
Let me analyze C1: the answer addresses the question. I should give it 1.
64+
Similarly C2 through C5... I'll check each.
65+
</think>
66+
67+
C1=1 C2=1 C3=1 C4=0 C5=1
68+
Reason: SQL was awkward but correct`
69+
score, reason := parseJudgeReply(reply)
70+
assert.Equal(t, 4, score)
71+
assert.Equal(t, "SQL was awkward but correct", reason)
72+
}
73+
74+
func TestParseJudgeReplyFirstMatchWins(t *testing.T) {
75+
t.Parallel()
76+
// When a model restates the criteria list before scoring (common
77+
// pattern), the SECOND mention is the verdict. The parser should
78+
// take the first match per criterion instead — restated criteria
79+
// often come with no score attached.
80+
reply := `Restating criteria: C1: ?, C2: ?, C3: ?, C4: ?, C5: ?
81+
Grading:
82+
C1=1 C2=1 C3=0 C4=0 C5=1
83+
Reason: partial credit`
84+
score, _ := parseJudgeReply(reply)
85+
// The "?" lines don't match [01] so they're ignored; only the
86+
// grading lines count. Score = 1+1+0+0+1 = 3.
87+
assert.Equal(t, 3, score)
88+
}
89+
90+
func TestParseJudgeReplyRationaleKeyword(t *testing.T) {
91+
t.Parallel()
92+
score, reason := parseJudgeReply("C1=1 C2=1 C3=1 C4=1 C5=1\nRationale: perfect")
93+
assert.Equal(t, 5, score)
94+
assert.Equal(t, "perfect", reason)
95+
}
96+
97+
func TestParseJudgeReplyWithReasonButNoScores(t *testing.T) {
98+
t.Parallel()
99+
// Only partial scores — must be treated as unparseable even
100+
// though a reason is present.
101+
score, reason := parseJudgeReply("C1=1 C2=1\nReason: incomplete")
102+
assert.Equal(t, -1, score)
103+
assert.Equal(t, "", reason)
104+
}
105+
106+
func TestTruncateReplyShort(t *testing.T) {
107+
t.Parallel()
108+
assert.Equal(t, "short reply", truncateReply(" short reply "))
109+
}
110+
111+
func TestTruncateReplyLong(t *testing.T) {
112+
t.Parallel()
113+
long := strings.Repeat("x", 250) + "THE VERDICT"
114+
got := truncateReply(long)
115+
assert.True(t, strings.HasSuffix(got, "THE VERDICT"),
116+
"truncation must keep the tail (where the verdict lives)")
117+
assert.True(t, strings.HasPrefix(got, "..."),
118+
"truncation must mark the leading cut")
119+
}
120+
33121
func TestStripFences(t *testing.T) {
34122
t.Parallel()
35123
cases := []struct{ in, want string }{
@@ -143,3 +231,110 @@ Vendor "Pacific" (id: 01JY)`
143231
assert.Equal(t, 2, arm.Grade.EntitiesTotal,
144232
"only non-empty expected IDs count toward totals")
145233
}
234+
235+
// sampleResults returns a minimal RunResult slice for smoke-testing
236+
// every WriteReport format path. Covers the three interesting cases
237+
// the formatters branch on: completed with judge, incomplete (stage-1
238+
// error, no summary), and completed with --skip-judge sentinel.
239+
func sampleResults() []RunResult {
240+
return []RunResult{
241+
{
242+
Question: Question{
243+
Name: "kitchen-status",
244+
ExpectedEntityIDs: []string{"01JX"},
245+
},
246+
FTSOn: ArmResult{
247+
GeneratedSQL: "SELECT 1",
248+
SummaryText: "done",
249+
Grade: GradeResult{
250+
Rubric: 2,
251+
RubricTotal: 3,
252+
JudgeScore: 4,
253+
EntitiesHit: 1,
254+
EntitiesTotal: 1,
255+
},
256+
},
257+
FTSOff: ArmResult{
258+
GeneratedSQL: "SELECT 1",
259+
SummaryText: "done",
260+
Grade: GradeResult{
261+
Rubric: 1,
262+
RubricTotal: 3,
263+
JudgeScore: 2,
264+
EntitiesHit: 0,
265+
EntitiesTotal: 1,
266+
},
267+
},
268+
},
269+
{
270+
Question: Question{Name: "stage1-fail"},
271+
FTSOn: ArmResult{
272+
ErrorKind: errorKindStage1,
273+
ErrorMsg: "provider down",
274+
Grade: GradeResult{Rubric: 0, RubricTotal: 2, JudgeScore: -1},
275+
},
276+
FTSOff: ArmResult{
277+
Grade: GradeResult{Rubric: 2, RubricTotal: 2, JudgeScore: 3},
278+
},
279+
},
280+
{
281+
Question: Question{Name: "skip-judge"},
282+
FTSOn: ArmResult{Grade: GradeResult{Rubric: 3, RubricTotal: 3, JudgeScore: -1}},
283+
FTSOff: ArmResult{Grade: GradeResult{Rubric: 3, RubricTotal: 3, JudgeScore: -1}},
284+
},
285+
}
286+
}
287+
288+
// TestWriteReportFormatsDoNotPanic is a smoke test for every format
289+
// WriteReport supports. Before this test landed, writeTableReport
290+
// passed nils to lipgloss.HasDarkBackground and SIGSEGV'd at runtime;
291+
// nothing in the package exercised the styled path. This test drives
292+
// each format + a few NoAB permutations so any nil-dereference,
293+
// out-of-range index, or formatter panic surfaces in CI.
294+
func TestWriteReportFormatsDoNotPanic(t *testing.T) {
295+
t.Parallel()
296+
results := sampleResults()
297+
298+
cases := []struct {
299+
format string
300+
noAB bool
301+
}{
302+
{"table", false},
303+
{"table", true},
304+
{"markdown", false},
305+
{"markdown", true},
306+
{"json", false},
307+
{"", false}, // unknown/empty should fall through to table
308+
}
309+
for _, tc := range cases {
310+
t.Run(tc.format+"-noab-"+strconv.FormatBool(tc.noAB), func(t *testing.T) {
311+
var buf bytes.Buffer
312+
cfg := Config{
313+
Provider: "ollama",
314+
Model: "qwen3",
315+
Format: tc.format,
316+
NoAB: tc.noAB,
317+
}
318+
require.NotPanics(t, func() {
319+
require.NoError(t, WriteReport(&buf, cfg, results))
320+
})
321+
assert.NotEmpty(t, buf.String(),
322+
"format=%q noAB=%v produced no output", tc.format, tc.noAB)
323+
})
324+
}
325+
}
326+
327+
// TestWriteReportEmptyResults covers the zero-question case so the
328+
// aggregate block and tables handle an empty slice without touching
329+
// out-of-range indices.
330+
func TestWriteReportEmptyResults(t *testing.T) {
331+
t.Parallel()
332+
for _, f := range []string{"table", "markdown", "json"} {
333+
t.Run(f, func(t *testing.T) {
334+
var buf bytes.Buffer
335+
require.NotPanics(t, func() {
336+
require.NoError(t, WriteReport(&buf, Config{Format: f}, nil))
337+
})
338+
})
339+
}
340+
}

0 commit comments

Comments
 (0)