@@ -161,6 +161,94 @@ describe("eval runner", () => {
161161 expect ( readFileSync ( path . join ( compareRun . outputDir , "compare.json" ) , "utf-8" ) ) . toContain ( "\"deltas\"" ) ;
162162 } ) ;
163163
// Test case: a run that compares against a baseline summary lacking the
// distinctTop3Ratio / rawDistinctTop3Ratio metrics is expected to reject.
164+ it ( "fails fast when baseline summary is missing required diversity metrics" , async ( ) => {
// First run (no againstPath, ciMode off) only serves to obtain a summary object to derive the baseline from.
165+ const baselineRun = await runEvaluation ( {
166+ projectRoot : tempDir ,
167+ datasetPath : "benchmarks/golden/small.json" ,
168+ outputRoot : "benchmarks/results" ,
169+ ciMode : false ,
170+ reindex : false ,
171+ } ) ;
172+
// Copy the summary and give it its own `metrics` object, so the deletes below
// do not touch baselineRun.summary.metrics. The cast to Record<string, unknown>
// is what allows `delete` on otherwise typed properties.
173+ const legacyBaseline = {
174+ ...baselineRun . summary ,
175+ metrics : {
176+ ...baselineRun . summary . metrics ,
177+ } ,
178+ } as Record < string , unknown > ;
179+
// Remove both diversity ratios so the serialized baseline lacks them.
180+ delete ( legacyBaseline . metrics as Record < string , unknown > ) . distinctTop3Ratio ;
181+ delete ( legacyBaseline . metrics as Record < string , unknown > ) . rawDistinctTop3Ratio ;
182+
// Persist the stripped summary under tempDir.
// NOTE(review): writeFileSync does not create parent directories, so this assumes
// tempDir/benchmarks/baselines already exists — presumably created by setup outside this test; confirm.
183+ const baselinePath = path . join ( tempDir , "benchmarks" , "baselines" , "legacy-baseline-summary.json" ) ;
184+ writeFileSync ( baselinePath , JSON . stringify ( legacyBaseline , null , 2 ) , "utf-8" ) ;
185+
// Second run points againstPath at the stripped file (presumably resolved relative
// to projectRoot — confirm) and must reject with a message matching the pattern below.
186+ await expect (
187+ runEvaluation ( {
188+ projectRoot : tempDir ,
189+ datasetPath : "benchmarks/golden/small.json" ,
190+ outputRoot : "benchmarks/results" ,
191+ againstPath : "benchmarks/baselines/legacy-baseline-summary.json" ,
192+ ciMode : false ,
193+ reindex : false ,
194+ } )
// NOTE(review): the regex text appears character-separated in this rendering; verify the literal against the real file.
195+ ) . rejects . toThrow ( / m e t r i c s \. d i s t i n c t T o p 3 R a t i o m u s t b e a f i n i t e n u m b e r / ) ;
196+ } ) ;
197+
// Test case: a CI-mode run whose budget file references a baseline summary lacking
// the distinctTop3Ratio / rawDistinctTop3Ratio metrics is expected to reject.
198+ it ( "fails ci mode when budget baseline summary is missing required diversity metrics" , async ( ) => {
// Non-CI run used only to obtain a summary object to derive the baseline from.
199+ const baselineRun = await runEvaluation ( {
200+ projectRoot : tempDir ,
201+ datasetPath : "benchmarks/golden/small.json" ,
202+ outputRoot : "benchmarks/results" ,
203+ ciMode : false ,
204+ reindex : false ,
205+ } ) ;
206+
// Copy the summary with a separate `metrics` object so the deletes below leave
// baselineRun.summary.metrics untouched.
207+ const legacyBaseline = {
208+ ...baselineRun . summary ,
209+ metrics : {
210+ ...baselineRun . summary . metrics ,
211+ } ,
212+ } as Record < string , unknown > ;
213+
// Remove both diversity ratios so the serialized baseline lacks them.
214+ delete ( legacyBaseline . metrics as Record < string , unknown > ) . distinctTop3Ratio ;
215+ delete ( legacyBaseline . metrics as Record < string , unknown > ) . rawDistinctTop3Ratio ;
216+
// Write the stripped baseline to the path the budget file's `baselinePath` names.
// NOTE(review): assumes tempDir/benchmarks/baselines already exists (writeFileSync
// does not create parent directories) — confirm against the suite setup.
217+ writeFileSync (
218+ path . join ( tempDir , "benchmarks" , "baselines" , "legacy-baseline-summary.json" ) ,
219+ JSON . stringify ( legacyBaseline , null , 2 ) ,
220+ "utf-8"
221+ ) ;
222+
// Write the budget definition consumed via `budgetPath` in the CI run below.
// NOTE(review): likewise assumes tempDir/benchmarks/budgets already exists — confirm.
223+ writeFileSync (
224+ path . join ( tempDir , "benchmarks" , "budgets" , "legacy-check.json" ) ,
225+ JSON . stringify (
226+ {
227+ name : "legacy-check" ,
228+ baselinePath : "benchmarks/baselines/legacy-baseline-summary.json" ,
229+ failOnMissingBaseline : true ,
230+ thresholds : {
// The only threshold targets rawDistinctTop3Ratio, while the expected error below names
// metrics.distinctTop3Ratio — presumably baseline validation runs before thresholds are evaluated; confirm.
231+ rawDistinctTop3RatioMaxDrop : 0.1 ,
232+ } ,
233+ } ,
234+ null ,
235+ 2
236+ ) ,
237+ "utf-8"
238+ ) ;
239+
// CI-mode run with the budget file; it must reject with a message matching the pattern below.
240+ await expect (
241+ runEvaluation ( {
242+ projectRoot : tempDir ,
243+ datasetPath : "benchmarks/golden/small.json" ,
244+ outputRoot : "benchmarks/results" ,
245+ ciMode : true ,
246+ budgetPath : "benchmarks/budgets/legacy-check.json" ,
247+ reindex : false ,
248+ } )
// NOTE(review): the regex text appears character-separated in this rendering; verify the literal against the real file.
249+ ) . rejects . toThrow ( / m e t r i c s \. d i s t i n c t T o p 3 R a t i o m u s t b e a f i n i t e n u m b e r / ) ;
250+ } ) ;
251+
164252 it ( "fails ci gate when thresholds regress beyond tolerance" , async ( ) => {
165253 const baselineRun = await runEvaluation ( {
166254 projectRoot : tempDir ,
0 commit comments