@@ -28,8 +28,8 @@ def analyzer() -> BenchmarkAnalyzer:
2828
2929
3030@pytest .fixture
31- def sample_strategy () -> Strategy :
32- """Sample Strategy for tests."""
31+ def simple_strategy () -> Strategy :
32+ """Simple Strategy for local tests (distinct from conftest.sample_strategy) ."""
3333 return Strategy (
3434 name = "Default" ,
3535 description = "Test strategy" ,
@@ -159,12 +159,12 @@ def test_subdirs_only_files(tmp_path: Path) -> None:
159159def test_pooled_std_dev_single_run (
160160 analyzer : BenchmarkAnalyzer ,
161161 sample_model_a : Model ,
162- sample_strategy : Strategy ,
162+ simple_strategy : Strategy ,
163163) -> None :
164164 """Single run with any calls_total <= 1 returns 0 for pooled std dev."""
165165 # Create a single run with calls_total = 1
166166 stats = make_stats (calls_total = 1 , tokens_in_std = 10.0 , tokens_in_avg = 100.0 )
167- run = make_run ("run-1" , sample_model_a , sample_strategy , stats = stats )
167+ run = make_run ("run-1" , sample_model_a , simple_strategy , stats = stats )
168168 runs = (run ,)
169169
170170 result = analyzer ._pooled_std_dev_from_runs (
@@ -175,17 +175,17 @@ def test_pooled_std_dev_single_run(
175175 total_n = 1 ,
176176 )
177177
178- assert result == 0.0
178+ assert result == pytest . approx ( 0.0 )
179179
180180
181181def test_pooled_std_dev_zero_total_n (
182182 analyzer : BenchmarkAnalyzer ,
183183 sample_model_a : Model ,
184- sample_strategy : Strategy ,
184+ simple_strategy : Strategy ,
185185) -> None :
186186 """total_n = 0 returns 0 for pooled std dev."""
187187 stats = make_stats (calls_total = 0 , tokens_in_std = 0.0 , tokens_in_avg = 0.0 )
188- run = make_run ("run-1" , sample_model_a , sample_strategy , stats = stats )
188+ run = make_run ("run-1" , sample_model_a , simple_strategy , stats = stats )
189189 runs = (run ,)
190190
191191 result = analyzer ._pooled_std_dev_from_runs (
@@ -196,13 +196,13 @@ def test_pooled_std_dev_zero_total_n(
196196 total_n = 0 ,
197197 )
198198
199- assert result == 0.0
199+ assert result == pytest . approx ( 0.0 )
200200
201201
202202def test_pooled_std_dev_multiple_runs (
203203 analyzer : BenchmarkAnalyzer ,
204204 sample_model_a : Model ,
205- sample_strategy : Strategy ,
205+ simple_strategy : Strategy ,
206206) -> None :
207207 """Pooled std dev is calculated correctly for multiple runs.
208208
@@ -221,11 +221,11 @@ def test_pooled_std_dev_multiple_runs(
221221 """
222222 # Run 1: n=10, mean=100, std=5
223223 stats1 = make_stats (calls_total = 10 , tokens_in_std = 5.0 , tokens_in_avg = 100.0 )
224- run1 = make_run ("run-1" , sample_model_a , sample_strategy , stats = stats1 )
224+ run1 = make_run ("run-1" , sample_model_a , simple_strategy , stats = stats1 )
225225
226226 # Run 2: n=10, mean=110, std=5
227227 stats2 = make_stats (calls_total = 10 , tokens_in_std = 5.0 , tokens_in_avg = 110.0 )
228- run2 = make_run ("run-2" , sample_model_a , sample_strategy , stats = stats2 )
228+ run2 = make_run ("run-2" , sample_model_a , simple_strategy , stats = stats2 )
229229
230230 runs = (run1 , run2 )
231231 overall_mean = 105.0
@@ -252,13 +252,13 @@ def test_pooled_std_dev_multiple_runs(
252252def test_compute_leaderboard_entry_single_run (
253253 analyzer : BenchmarkAnalyzer ,
254254 sample_model_a : Model ,
255- sample_strategy : Strategy ,
255+ simple_strategy : Strategy ,
256256) -> None :
257257 """Single run should have std_round = 0."""
258258 run = make_run (
259259 "run-1" ,
260260 sample_model_a ,
261- sample_strategy ,
261+ simple_strategy ,
262262 final_round = 10 ,
263263 run_won = True ,
264264 run_completed = True ,
@@ -270,14 +270,14 @@ def test_compute_leaderboard_entry_single_run(
270270 assert entry .run_count == 1
271271 assert entry .run_wins == 1
272272 assert entry .run_completed == 1
273- assert entry .avg_round == 10.0
274- assert entry .std_round == 0.0 # Single run = no variance
273+ assert entry .avg_round == pytest . approx ( 10.0 )
274+ assert entry .std_round == pytest . approx ( 0.0 ) # Single run = no variance
275275
276276
277277def test_compute_leaderboard_entry_aggregates (
278278 analyzer : BenchmarkAnalyzer ,
279279 sample_model_a : Model ,
280- sample_strategy : Strategy ,
280+ simple_strategy : Strategy ,
281281) -> None :
282282 """Correctly sums/averages stats across multiple runs."""
283283 # Run 1: rounds=8, won=False, completed=True
@@ -291,7 +291,7 @@ def test_compute_leaderboard_entry_aggregates(
291291 run1 = make_run (
292292 "run-1" ,
293293 sample_model_a ,
294- sample_strategy ,
294+ simple_strategy ,
295295 final_round = 8 ,
296296 run_won = False ,
297297 run_completed = True ,
@@ -309,7 +309,7 @@ def test_compute_leaderboard_entry_aggregates(
309309 run2 = make_run (
310310 "run-2" ,
311311 sample_model_a ,
312- sample_strategy ,
312+ simple_strategy ,
313313 final_round = 12 ,
314314 run_won = True ,
315315 run_completed = True ,
@@ -327,7 +327,7 @@ def test_compute_leaderboard_entry_aggregates(
327327 run3 = make_run (
328328 "run-3" ,
329329 sample_model_a ,
330- sample_strategy ,
330+ simple_strategy ,
331331 final_round = 10 ,
332332 run_won = False ,
333333 run_completed = False ,
@@ -343,29 +343,29 @@ def test_compute_leaderboard_entry_aggregates(
343343 assert entry .run_completed == 2 # run1 and run2 completed
344344
345345 # Round statistics
346- assert entry .avg_round == 10.0 # (8 + 12 + 10) / 3 = 10
346+ assert entry .avg_round == pytest . approx ( 10.0 ) # (8 + 12 + 10) / 3 = 10
347347 # std_round for [8, 12, 10] = statistics.stdev([8, 12, 10])
348348 import statistics
349349
350- assert abs ( entry .std_round - statistics .stdev ([8 , 12 , 10 ])) < 0.001
350+ assert entry .std_round == pytest . approx ( statistics .stdev ([8 , 12 , 10 ]))
351351
352352 # Aggregated stats
353353 assert entry .stats .calls_total == 150 # 50 * 3
354354 assert entry .stats .tokens_in_total == 15000 # 5000 * 3
355355 assert entry .stats .tokens_out_total == 3000 # 1000 * 3
356356 assert entry .stats .time_total_ms == 75000 # 25000 * 3
357- assert entry .stats .cost_total == 1.5 # 0.5 * 3
357+ assert entry .stats .cost_total == pytest . approx ( 1.5 ) # 0.5 * 3
358358
359359 # Per-call averages
360- assert entry .stats .tokens_in_avg == 100.0 # 15000 / 150
361- assert entry .stats .tokens_out_avg == 20.0 # 3000 / 150
362- assert entry .stats .cost_avg == 0.01 # 1.5 / 150
360+ assert entry .stats .tokens_in_avg == pytest . approx ( 100.0 ) # 15000 / 150
361+ assert entry .stats .tokens_out_avg == pytest . approx ( 20.0 ) # 3000 / 150
362+ assert entry .stats .cost_avg == pytest . approx ( 0.01 ) # 1.5 / 150
363363
364364
365365def test_compute_leaderboard_entry_zero_calls (
366366 analyzer : BenchmarkAnalyzer ,
367367 sample_model_a : Model ,
368- sample_strategy : Strategy ,
368+ simple_strategy : Strategy ,
369369) -> None :
370370 """Zero calls_total handles division by zero correctly."""
371371 stats = make_stats (
@@ -379,17 +379,17 @@ def test_compute_leaderboard_entry_zero_calls(
379379 cost_total = 0.0 ,
380380 cost_avg = 0.0 ,
381381 )
382- run = make_run ("run-1" , sample_model_a , sample_strategy , stats = stats )
382+ run = make_run ("run-1" , sample_model_a , simple_strategy , stats = stats )
383383 runs = (run ,)
384384
385385 # Should not raise ZeroDivisionError
386386 entry = analyzer ._compute_leaderboard_entry (runs )
387387
388388 # Averages should all be 0
389- assert entry .stats .tokens_in_avg == 0.0
390- assert entry .stats .tokens_out_avg == 0.0
391- assert entry .stats .time_avg_ms == 0.0
392- assert entry .stats .cost_avg == 0.0
389+ assert entry .stats .tokens_in_avg == pytest . approx ( 0.0 )
390+ assert entry .stats .tokens_out_avg == pytest . approx ( 0.0 )
391+ assert entry .stats .time_avg_ms == pytest . approx ( 0.0 )
392+ assert entry .stats .cost_avg == pytest . approx ( 0.0 )
393393
394394
395395# =============================================================================
@@ -399,86 +399,86 @@ def test_compute_leaderboard_entry_zero_calls(
399399
400400def test_leaderboard_sorting (
401401 analyzer : BenchmarkAnalyzer ,
402- sample_strategy : Strategy ,
402+ simple_strategy : Strategy ,
403403 sample_model_a : Model ,
404404 sample_model_b : Model ,
405405) -> None :
406406 """Leaderboard entries are sorted by avg_round descending."""
407407 # Model A: low avg_round (8)
408- run_a = make_run ("run-a" , sample_model_a , sample_strategy , final_round = 8 )
408+ run_a = make_run ("run-a" , sample_model_a , simple_strategy , final_round = 8 )
409409 runs_a = Runs (
410410 generated_at = 0 ,
411411 model = sample_model_a ,
412- strategy = sample_strategy ,
412+ strategy = simple_strategy ,
413413 runs = (run_a ,),
414414 )
415415
416416 # Model B: high avg_round (15)
417- run_b = make_run ("run-b" , sample_model_b , sample_strategy , final_round = 15 )
417+ run_b = make_run ("run-b" , sample_model_b , simple_strategy , final_round = 15 )
418418 runs_b = Runs (
419419 generated_at = 0 ,
420420 model = sample_model_b ,
421- strategy = sample_strategy ,
421+ strategy = simple_strategy ,
422422 runs = (run_b ,),
423423 )
424424
425425 # Create leaderboard (pass in unsorted order: A first, B second)
426426 with patch ("time.time" , return_value = 1234567890 ):
427427 leaderboard = analyzer .create_models_leaderboard (
428- sample_strategy , [runs_a , runs_b ]
428+ simple_strategy , [runs_a , runs_b ]
429429 )
430430
431431 # Entries should be sorted by avg_round descending (B first, A second)
432432 assert len (leaderboard .entries ) == 2
433433 assert leaderboard .entries [0 ].model == sample_model_b # Higher avg_round
434- assert leaderboard .entries [0 ].avg_round == 15.0
434+ assert leaderboard .entries [0 ].avg_round == pytest . approx ( 15.0 )
435435 assert leaderboard .entries [1 ].model == sample_model_a # Lower avg_round
436- assert leaderboard .entries [1 ].avg_round == 8.0
436+ assert leaderboard .entries [1 ].avg_round == pytest . approx ( 8.0 )
437437
438438 # Verify leaderboard metadata
439- assert leaderboard .strategy == sample_strategy
439+ assert leaderboard .strategy == simple_strategy
440440 assert leaderboard .generated_at == 1234567890
441441
442442
443443def test_create_models_leaderboard_empty_list (
444444 analyzer : BenchmarkAnalyzer ,
445- sample_strategy : Strategy ,
445+ simple_strategy : Strategy ,
446446) -> None :
447447 """Empty runs list creates leaderboard with no entries."""
448448 with patch ("time.time" , return_value = 1234567890 ):
449- leaderboard = analyzer .create_models_leaderboard (sample_strategy , [])
449+ leaderboard = analyzer .create_models_leaderboard (simple_strategy , [])
450450
451451 assert len (leaderboard .entries ) == 0
452- assert leaderboard .strategy == sample_strategy
452+ assert leaderboard .strategy == simple_strategy
453453
454454
455455def test_create_models_leaderboard_ties (
456456 analyzer : BenchmarkAnalyzer ,
457- sample_strategy : Strategy ,
457+ simple_strategy : Strategy ,
458458 sample_model_a : Model ,
459459 sample_model_b : Model ,
460460) -> None :
461461 """Leaderboard handles ties in avg_round (order is stable)."""
462462 # Both models have same avg_round
463- run_a = make_run ("run-a" , sample_model_a , sample_strategy , final_round = 10 )
463+ run_a = make_run ("run-a" , sample_model_a , simple_strategy , final_round = 10 )
464464 runs_a = Runs (
465465 generated_at = 0 ,
466466 model = sample_model_a ,
467- strategy = sample_strategy ,
467+ strategy = simple_strategy ,
468468 runs = (run_a ,),
469469 )
470470
471- run_b = make_run ("run-b" , sample_model_b , sample_strategy , final_round = 10 )
471+ run_b = make_run ("run-b" , sample_model_b , simple_strategy , final_round = 10 )
472472 runs_b = Runs (
473473 generated_at = 0 ,
474474 model = sample_model_b ,
475- strategy = sample_strategy ,
475+ strategy = simple_strategy ,
476476 runs = (run_b ,),
477477 )
478478
479- leaderboard = analyzer .create_models_leaderboard (sample_strategy , [runs_a , runs_b ])
479+ leaderboard = analyzer .create_models_leaderboard (simple_strategy , [runs_a , runs_b ])
480480
481481 # Both should be present with same avg_round
482482 assert len (leaderboard .entries ) == 2
483- assert leaderboard .entries [0 ].avg_round == 10.0
484- assert leaderboard .entries [1 ].avg_round == 10.0
483+ assert leaderboard .entries [0 ].avg_round == pytest . approx ( 10.0 )
484+ assert leaderboard .entries [1 ].avg_round == pytest . approx ( 10.0 )
0 commit comments