Skip to content

Commit 926c828

Browse files
committed
fix: allow eval diff to read legacy summaries
1 parent 454766f commit 926c828

File tree

3 files changed, with 82 additions and 7 deletions.

src/eval/cli.ts

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -336,8 +336,12 @@ export async function handleEvalCommand(args: string[], cwd: string): Promise<nu
336336
if (!parsed.againstPath.endsWith(".json")) {
337337
throw new Error("eval diff --against must point to a summary JSON file");
338338
}
339-
const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath));
340-
const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath));
339+
const currentSummary = loadSummary(path.resolve(parsed.projectRoot, currentPath), {
340+
allowLegacyDiversityMetrics: true,
341+
});
342+
const baselineSummary = loadSummary(path.resolve(parsed.projectRoot, parsed.againstPath), {
343+
allowLegacyDiversityMetrics: true,
344+
});
341345
const comparison = compareSummaries(
342346
currentSummary,
343347
baselineSummary,

src/eval/reports.ts

Lines changed: 26 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -9,22 +9,43 @@ import type {
99
SweepAggregateReport,
1010
} from "./types.js";
1111

12+
interface LoadSummaryOptions {
13+
allowLegacyDiversityMetrics?: boolean;
14+
}
15+
1216
function assertFiniteNumber(value: unknown, path: string): number {
1317
if (typeof value !== "number" || Number.isNaN(value) || !Number.isFinite(value)) {
1418
throw new Error(`${path} must be a finite number`);
1519
}
1620
return value;
1721
}
1822

19-
function validateSummary(summary: EvalSummary, summaryPath: string): EvalSummary {
23+
function validateSummary(
24+
summary: EvalSummary,
25+
summaryPath: string,
26+
options?: LoadSummaryOptions
27+
): EvalSummary {
2028
assertFiniteNumber(summary.metrics.hitAt1, `${summaryPath}.metrics.hitAt1`);
2129
assertFiniteNumber(summary.metrics.hitAt3, `${summaryPath}.metrics.hitAt3`);
2230
assertFiniteNumber(summary.metrics.hitAt5, `${summaryPath}.metrics.hitAt5`);
2331
assertFiniteNumber(summary.metrics.hitAt10, `${summaryPath}.metrics.hitAt10`);
2432
assertFiniteNumber(summary.metrics.mrrAt10, `${summaryPath}.metrics.mrrAt10`);
2533
assertFiniteNumber(summary.metrics.ndcgAt10, `${summaryPath}.metrics.ndcgAt10`);
26-
assertFiniteNumber(summary.metrics.distinctTop3Ratio, `${summaryPath}.metrics.distinctTop3Ratio`);
27-
assertFiniteNumber(summary.metrics.rawDistinctTop3Ratio, `${summaryPath}.metrics.rawDistinctTop3Ratio`);
34+
35+
const metrics = summary.metrics as EvalSummary["metrics"] & {
36+
distinctTop3Ratio?: number;
37+
rawDistinctTop3Ratio?: number;
38+
};
39+
40+
if (metrics.distinctTop3Ratio === undefined && options?.allowLegacyDiversityMetrics) {
41+
metrics.distinctTop3Ratio = 0;
42+
}
43+
if (metrics.rawDistinctTop3Ratio === undefined && options?.allowLegacyDiversityMetrics) {
44+
metrics.rawDistinctTop3Ratio = 0;
45+
}
46+
47+
assertFiniteNumber(metrics.distinctTop3Ratio, `${summaryPath}.metrics.distinctTop3Ratio`);
48+
assertFiniteNumber(metrics.rawDistinctTop3Ratio, `${summaryPath}.metrics.rawDistinctTop3Ratio`);
2849
assertFiniteNumber(summary.metrics.latencyMs.p50, `${summaryPath}.metrics.latencyMs.p50`);
2950
assertFiniteNumber(summary.metrics.latencyMs.p95, `${summaryPath}.metrics.latencyMs.p95`);
3051
assertFiniteNumber(summary.metrics.latencyMs.p99, `${summaryPath}.metrics.latencyMs.p99`);
@@ -51,9 +72,9 @@ function signed(value: number, digits = 4): string {
5172
return value > 0 ? `+${formatted}` : formatted;
5273
}
5374

54-
export function loadSummary(summaryPath: string): EvalSummary {
75+
export function loadSummary(summaryPath: string, options?: LoadSummaryOptions): EvalSummary {
5576
const raw = readFileSync(summaryPath, "utf-8");
56-
return validateSummary(JSON.parse(raw) as EvalSummary, summaryPath);
77+
return validateSummary(JSON.parse(raw) as EvalSummary, summaryPath, options);
5778
}
5879

5980
export function createRunDirectory(outputRoot: string, timestampOverride?: string): string {

tests/eval-cli.test.ts

Lines changed: 50 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,8 @@ describe("eval cli", () => {
108108
hitAt10: 1,
109109
mrrAt10: 1,
110110
ndcgAt10: 1,
111+
distinctTop3Ratio: 1,
112+
rawDistinctTop3Ratio: 1,
111113
latencyMs: { p50: 1, p95: 2, p99: 3 },
112114
tokenEstimate: { queryTokens: 10, embeddingTokensUsed: 20 },
113115
embedding: { callCount: 1, estimatedCostUsd: 0, costPer1MTokensUsd: 0 },
@@ -130,4 +132,52 @@ describe("eval cli", () => {
130132

131133
expect(exitCode).toBe(0);
132134
});
135+
136+
it("allows eval diff to read legacy summaries missing diversity metrics", async () => {
137+
const currentSummaryPath = path.join(tempDir, "current.json");
138+
const baselineSummaryPath = path.join(tempDir, "baseline.json");
139+
140+
const legacySummary = {
141+
generatedAt: new Date().toISOString(),
142+
projectRoot: tempDir,
143+
datasetPath: "benchmarks/golden/small.json",
144+
datasetName: "small",
145+
datasetVersion: "1.0.0",
146+
queryCount: 1,
147+
topK: 10,
148+
searchConfig: {
149+
fusionStrategy: "rrf",
150+
hybridWeight: 0.4,
151+
rrfK: 60,
152+
rerankTopN: 20,
153+
},
154+
metrics: {
155+
hitAt1: 1,
156+
hitAt3: 1,
157+
hitAt5: 1,
158+
hitAt10: 1,
159+
mrrAt10: 1,
160+
ndcgAt10: 1,
161+
latencyMs: { p50: 1, p95: 2, p99: 3 },
162+
tokenEstimate: { queryTokens: 10, embeddingTokensUsed: 20 },
163+
embedding: { callCount: 1, estimatedCostUsd: 0, costPer1MTokensUsd: 0 },
164+
failureBuckets: {
165+
"wrong-file": 0,
166+
"wrong-symbol": 0,
167+
"docs-tests-outranking-source": 0,
168+
"no-relevant-hit-top-k": 0,
169+
},
170+
},
171+
};
172+
173+
writeFileSync(currentSummaryPath, JSON.stringify(legacySummary, null, 2), "utf-8");
174+
writeFileSync(baselineSummaryPath, JSON.stringify(legacySummary, null, 2), "utf-8");
175+
176+
const exitCode = await handleEvalCommand(
177+
["diff", "--current", "current.json", "--against", "baseline.json"],
178+
tempDir
179+
);
180+
181+
expect(exitCode).toBe(0);
182+
});
133183
});

0 commit comments

Comments
 (0)