Skip to content

Commit 454766f

Browse files
committed
fix: reject legacy eval baselines missing diversity metrics
1 parent 2372deb commit 454766f

File tree

2 files changed

+114
-1
lines changed

2 files changed

+114
-1
lines changed

src/eval/reports.ts

Lines changed: 26 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,31 @@ import type {
99
SweepAggregateReport,
1010
} from "./types.js";
1111

12+
/**
 * Ensures `value` is a finite number and returns it.
 *
 * @param value - Candidate value of unknown type.
 * @param path - Label for the value, used as the prefix of the error message.
 * @returns `value`, narrowed to `number`.
 * @throws Error with the message `"<path> must be a finite number"` when
 *   `value` is not of type number, or is NaN, Infinity or -Infinity.
 */
function assertFiniteNumber(value: unknown, path: string): number {
  // Number.isFinite is already false for NaN and both infinities, so no
  // separate NaN test is needed; the typeof guard narrows for the return.
  if (typeof value !== "number" || !Number.isFinite(value)) {
    throw new Error(`${path} must be a finite number`);
  }
  return value;
}
18+
19+
/**
 * Checks that every numeric metric this module requires is present on
 * `summary` as a finite number, then returns the same object.
 *
 * Intermediate containers (`metrics`, `metrics.latencyMs`,
 * `metrics.embedding`) are traversed defensively: when one of them is missing
 * or not an object, the affected leaf is reported through
 * `assertFiniteNumber` with its full path instead of surfacing as a
 * property-access `TypeError`.
 *
 * @param summary - Summary object to validate (typically freshly parsed JSON).
 * @param summaryPath - Prefix used in error messages to identify the summary.
 * @returns The `summary` argument, unchanged.
 * @throws Error `"<summaryPath>.metrics.<metric> must be a finite number"` for
 *   the first metric, in the order listed below, that is missing or not finite.
 */
function validateSummary(summary: EvalSummary, summaryPath: string): EvalSummary {
  // Dotted paths relative to `summary.metrics`; order determines which
  // failure is reported first.
  const requiredMetricPaths = [
    "hitAt1",
    "hitAt3",
    "hitAt5",
    "hitAt10",
    "mrrAt10",
    "ndcgAt10",
    "distinctTop3Ratio",
    "rawDistinctTop3Ratio",
    "latencyMs.p50",
    "latencyMs.p95",
    "latencyMs.p99",
    "embedding.callCount",
    "embedding.estimatedCostUsd",
  ] as const;

  for (const metricPath of requiredMetricPaths) {
    let node: unknown = summary;
    for (const key of ["metrics", ...metricPath.split(".")]) {
      // Stop descending (yielding undefined) as soon as a container is absent.
      node =
        typeof node === "object" && node !== null
          ? (node as Record<string, unknown>)[key]
          : undefined;
    }
    assertFiniteNumber(node, `${summaryPath}.metrics.${metricPath}`);
  }

  return summary;
}
36+
1237
/**
 * Renders a ratio as a percentage string with two decimal places.
 *
 * @param value - Ratio to render; it is multiplied by 100 before formatting.
 * @returns The scaled value in fixed-point notation followed by `%`.
 */
function formatPct(value: number): string {
  const scaled = value * 100;
  return scaled.toFixed(2) + "%";
}
@@ -28,7 +53,7 @@ function signed(value: number, digits = 4): string {
2853

2954
/**
 * Reads an evaluation summary from disk and validates its required metrics.
 *
 * @param summaryPath - Path of the JSON summary file; also used as the prefix
 *   of validation error messages.
 * @returns The parsed summary once `validateSummary` has accepted it.
 * @throws Whatever `readFileSync` or `JSON.parse` throw for an unreadable or
 *   malformed file, and the validation error raised by `validateSummary`.
 */
export function loadSummary(summaryPath: string): EvalSummary {
  const raw = readFileSync(summaryPath, "utf-8");
  // The parsed value is only asserted to be an EvalSummary; validateSummary
  // performs the runtime check before the value reaches callers.
  return validateSummary(JSON.parse(raw) as EvalSummary, summaryPath);
}
3358

3459
export function createRunDirectory(outputRoot: string, timestampOverride?: string): string {

tests/eval-runner.test.ts

Lines changed: 88 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -161,6 +161,94 @@ describe("eval runner", () => {
161161
expect(readFileSync(path.join(compareRun.outputDir, "compare.json"), "utf-8")).toContain("\"deltas\"");
162162
});
163163

164+
it("fails fast when baseline summary is missing required diversity metrics", async () => {
  // Options shared by the baseline-producing run and the comparison run.
  const sharedOptions = {
    projectRoot: tempDir,
    datasetPath: "benchmarks/golden/small.json",
    outputRoot: "benchmarks/results",
    ciMode: false,
    reindex: false,
  };

  const { summary } = await runEvaluation(sharedOptions);

  // Emulate a legacy baseline: same summary, minus the two diversity ratios.
  const legacyMetrics: Record<string, unknown> = { ...summary.metrics };
  delete legacyMetrics.distinctTop3Ratio;
  delete legacyMetrics.rawDistinctTop3Ratio;
  const legacyBaselineJson = JSON.stringify({ ...summary, metrics: legacyMetrics }, null, 2);

  writeFileSync(
    path.join(tempDir, "benchmarks", "baselines", "legacy-baseline-summary.json"),
    legacyBaselineJson,
    "utf-8"
  );

  // Comparing against the stripped baseline must reject with the validation error.
  await expect(
    runEvaluation({
      ...sharedOptions,
      againstPath: "benchmarks/baselines/legacy-baseline-summary.json",
    })
  ).rejects.toThrow(/metrics\.distinctTop3Ratio must be a finite number/);
});
197+
198+
it("fails ci mode when budget baseline summary is missing required diversity metrics", async () => {
  // Options shared by the baseline-producing run and the CI-mode run.
  const sharedOptions = {
    projectRoot: tempDir,
    datasetPath: "benchmarks/golden/small.json",
    outputRoot: "benchmarks/results",
    reindex: false,
  };

  const { summary } = await runEvaluation({ ...sharedOptions, ciMode: false });

  // Emulate a legacy baseline: same summary, minus the two diversity ratios.
  const legacyMetrics: Record<string, unknown> = { ...summary.metrics };
  delete legacyMetrics.distinctTop3Ratio;
  delete legacyMetrics.rawDistinctTop3Ratio;

  const benchmarksDir = path.join(tempDir, "benchmarks");

  writeFileSync(
    path.join(benchmarksDir, "baselines", "legacy-baseline-summary.json"),
    JSON.stringify({ ...summary, metrics: legacyMetrics }, null, 2),
    "utf-8"
  );

  // Budget definition that points the CI gate at the stripped baseline.
  const legacyBudget = {
    name: "legacy-check",
    baselinePath: "benchmarks/baselines/legacy-baseline-summary.json",
    failOnMissingBaseline: true,
    thresholds: {
      rawDistinctTop3RatioMaxDrop: 0.1,
    },
  };

  writeFileSync(
    path.join(benchmarksDir, "budgets", "legacy-check.json"),
    JSON.stringify(legacyBudget, null, 2),
    "utf-8"
  );

  // The CI run must reject with the validation error for the missing metric.
  await expect(
    runEvaluation({
      ...sharedOptions,
      ciMode: true,
      budgetPath: "benchmarks/budgets/legacy-check.json",
    })
  ).rejects.toThrow(/metrics\.distinctTop3Ratio must be a finite number/);
});
251+
164252
it("fails ci gate when thresholds regress beyond tolerance", async () => {
165253
const baselineRun = await runEvaluation({
166254
projectRoot: tempDir,

0 commit comments

Comments
 (0)