Skip to content

Commit bf8528e

Browse files
committed
fix: rename duplicate fixture and use pytest.approx()
- Rename sample_strategy -> simple_strategy to avoid shadowing the conftest.py fixture
- Use pytest.approx() for floating-point comparisons to avoid potential precision issues
1 parent b6dc7a3 commit bf8528e

1 file changed

Lines changed: 51 additions & 51 deletions

File tree

tests/balatrobench/unit/test_analyzer.py

Lines changed: 51 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ def analyzer() -> BenchmarkAnalyzer:
2828

2929

3030
@pytest.fixture
31-
def sample_strategy() -> Strategy:
32-
"""Sample Strategy for tests."""
31+
def simple_strategy() -> Strategy:
32+
"""Simple Strategy for local tests (distinct from conftest.sample_strategy)."""
3333
return Strategy(
3434
name="Default",
3535
description="Test strategy",
@@ -159,12 +159,12 @@ def test_subdirs_only_files(tmp_path: Path) -> None:
159159
def test_pooled_std_dev_single_run(
160160
analyzer: BenchmarkAnalyzer,
161161
sample_model_a: Model,
162-
sample_strategy: Strategy,
162+
simple_strategy: Strategy,
163163
) -> None:
164164
"""Single run with any calls_total <= 1 returns 0 for pooled std dev."""
165165
# Create a single run with calls_total = 1
166166
stats = make_stats(calls_total=1, tokens_in_std=10.0, tokens_in_avg=100.0)
167-
run = make_run("run-1", sample_model_a, sample_strategy, stats=stats)
167+
run = make_run("run-1", sample_model_a, simple_strategy, stats=stats)
168168
runs = (run,)
169169

170170
result = analyzer._pooled_std_dev_from_runs(
@@ -175,17 +175,17 @@ def test_pooled_std_dev_single_run(
175175
total_n=1,
176176
)
177177

178-
assert result == 0.0
178+
assert result == pytest.approx(0.0)
179179

180180

181181
def test_pooled_std_dev_zero_total_n(
182182
analyzer: BenchmarkAnalyzer,
183183
sample_model_a: Model,
184-
sample_strategy: Strategy,
184+
simple_strategy: Strategy,
185185
) -> None:
186186
"""total_n = 0 returns 0 for pooled std dev."""
187187
stats = make_stats(calls_total=0, tokens_in_std=0.0, tokens_in_avg=0.0)
188-
run = make_run("run-1", sample_model_a, sample_strategy, stats=stats)
188+
run = make_run("run-1", sample_model_a, simple_strategy, stats=stats)
189189
runs = (run,)
190190

191191
result = analyzer._pooled_std_dev_from_runs(
@@ -196,13 +196,13 @@ def test_pooled_std_dev_zero_total_n(
196196
total_n=0,
197197
)
198198

199-
assert result == 0.0
199+
assert result == pytest.approx(0.0)
200200

201201

202202
def test_pooled_std_dev_multiple_runs(
203203
analyzer: BenchmarkAnalyzer,
204204
sample_model_a: Model,
205-
sample_strategy: Strategy,
205+
simple_strategy: Strategy,
206206
) -> None:
207207
"""Pooled std dev is calculated correctly for multiple runs.
208208
@@ -221,11 +221,11 @@ def test_pooled_std_dev_multiple_runs(
221221
"""
222222
# Run 1: n=10, mean=100, std=5
223223
stats1 = make_stats(calls_total=10, tokens_in_std=5.0, tokens_in_avg=100.0)
224-
run1 = make_run("run-1", sample_model_a, sample_strategy, stats=stats1)
224+
run1 = make_run("run-1", sample_model_a, simple_strategy, stats=stats1)
225225

226226
# Run 2: n=10, mean=110, std=5
227227
stats2 = make_stats(calls_total=10, tokens_in_std=5.0, tokens_in_avg=110.0)
228-
run2 = make_run("run-2", sample_model_a, sample_strategy, stats=stats2)
228+
run2 = make_run("run-2", sample_model_a, simple_strategy, stats=stats2)
229229

230230
runs = (run1, run2)
231231
overall_mean = 105.0
@@ -252,13 +252,13 @@ def test_pooled_std_dev_multiple_runs(
252252
def test_compute_leaderboard_entry_single_run(
253253
analyzer: BenchmarkAnalyzer,
254254
sample_model_a: Model,
255-
sample_strategy: Strategy,
255+
simple_strategy: Strategy,
256256
) -> None:
257257
"""Single run should have std_round = 0."""
258258
run = make_run(
259259
"run-1",
260260
sample_model_a,
261-
sample_strategy,
261+
simple_strategy,
262262
final_round=10,
263263
run_won=True,
264264
run_completed=True,
@@ -270,14 +270,14 @@ def test_compute_leaderboard_entry_single_run(
270270
assert entry.run_count == 1
271271
assert entry.run_wins == 1
272272
assert entry.run_completed == 1
273-
assert entry.avg_round == 10.0
274-
assert entry.std_round == 0.0 # Single run = no variance
273+
assert entry.avg_round == pytest.approx(10.0)
274+
assert entry.std_round == pytest.approx(0.0) # Single run = no variance
275275

276276

277277
def test_compute_leaderboard_entry_aggregates(
278278
analyzer: BenchmarkAnalyzer,
279279
sample_model_a: Model,
280-
sample_strategy: Strategy,
280+
simple_strategy: Strategy,
281281
) -> None:
282282
"""Correctly sums/averages stats across multiple runs."""
283283
# Run 1: rounds=8, won=False, completed=True
@@ -291,7 +291,7 @@ def test_compute_leaderboard_entry_aggregates(
291291
run1 = make_run(
292292
"run-1",
293293
sample_model_a,
294-
sample_strategy,
294+
simple_strategy,
295295
final_round=8,
296296
run_won=False,
297297
run_completed=True,
@@ -309,7 +309,7 @@ def test_compute_leaderboard_entry_aggregates(
309309
run2 = make_run(
310310
"run-2",
311311
sample_model_a,
312-
sample_strategy,
312+
simple_strategy,
313313
final_round=12,
314314
run_won=True,
315315
run_completed=True,
@@ -327,7 +327,7 @@ def test_compute_leaderboard_entry_aggregates(
327327
run3 = make_run(
328328
"run-3",
329329
sample_model_a,
330-
sample_strategy,
330+
simple_strategy,
331331
final_round=10,
332332
run_won=False,
333333
run_completed=False,
@@ -343,29 +343,29 @@ def test_compute_leaderboard_entry_aggregates(
343343
assert entry.run_completed == 2 # run1 and run2 completed
344344

345345
# Round statistics
346-
assert entry.avg_round == 10.0 # (8 + 12 + 10) / 3 = 10
346+
assert entry.avg_round == pytest.approx(10.0) # (8 + 12 + 10) / 3 = 10
347347
# std_round for [8, 12, 10] = statistics.stdev([8, 12, 10])
348348
import statistics
349349

350-
assert abs(entry.std_round - statistics.stdev([8, 12, 10])) < 0.001
350+
assert entry.std_round == pytest.approx(statistics.stdev([8, 12, 10]))
351351

352352
# Aggregated stats
353353
assert entry.stats.calls_total == 150 # 50 * 3
354354
assert entry.stats.tokens_in_total == 15000 # 5000 * 3
355355
assert entry.stats.tokens_out_total == 3000 # 1000 * 3
356356
assert entry.stats.time_total_ms == 75000 # 25000 * 3
357-
assert entry.stats.cost_total == 1.5 # 0.5 * 3
357+
assert entry.stats.cost_total == pytest.approx(1.5) # 0.5 * 3
358358

359359
# Per-call averages
360-
assert entry.stats.tokens_in_avg == 100.0 # 15000 / 150
361-
assert entry.stats.tokens_out_avg == 20.0 # 3000 / 150
362-
assert entry.stats.cost_avg == 0.01 # 1.5 / 150
360+
assert entry.stats.tokens_in_avg == pytest.approx(100.0) # 15000 / 150
361+
assert entry.stats.tokens_out_avg == pytest.approx(20.0) # 3000 / 150
362+
assert entry.stats.cost_avg == pytest.approx(0.01) # 1.5 / 150
363363

364364

365365
def test_compute_leaderboard_entry_zero_calls(
366366
analyzer: BenchmarkAnalyzer,
367367
sample_model_a: Model,
368-
sample_strategy: Strategy,
368+
simple_strategy: Strategy,
369369
) -> None:
370370
"""Zero calls_total handles division by zero correctly."""
371371
stats = make_stats(
@@ -379,17 +379,17 @@ def test_compute_leaderboard_entry_zero_calls(
379379
cost_total=0.0,
380380
cost_avg=0.0,
381381
)
382-
run = make_run("run-1", sample_model_a, sample_strategy, stats=stats)
382+
run = make_run("run-1", sample_model_a, simple_strategy, stats=stats)
383383
runs = (run,)
384384

385385
# Should not raise ZeroDivisionError
386386
entry = analyzer._compute_leaderboard_entry(runs)
387387

388388
# Averages should all be 0
389-
assert entry.stats.tokens_in_avg == 0.0
390-
assert entry.stats.tokens_out_avg == 0.0
391-
assert entry.stats.time_avg_ms == 0.0
392-
assert entry.stats.cost_avg == 0.0
389+
assert entry.stats.tokens_in_avg == pytest.approx(0.0)
390+
assert entry.stats.tokens_out_avg == pytest.approx(0.0)
391+
assert entry.stats.time_avg_ms == pytest.approx(0.0)
392+
assert entry.stats.cost_avg == pytest.approx(0.0)
393393

394394

395395
# =============================================================================
@@ -399,86 +399,86 @@ def test_compute_leaderboard_entry_zero_calls(
399399

400400
def test_leaderboard_sorting(
401401
analyzer: BenchmarkAnalyzer,
402-
sample_strategy: Strategy,
402+
simple_strategy: Strategy,
403403
sample_model_a: Model,
404404
sample_model_b: Model,
405405
) -> None:
406406
"""Leaderboard entries are sorted by avg_round descending."""
407407
# Model A: low avg_round (8)
408-
run_a = make_run("run-a", sample_model_a, sample_strategy, final_round=8)
408+
run_a = make_run("run-a", sample_model_a, simple_strategy, final_round=8)
409409
runs_a = Runs(
410410
generated_at=0,
411411
model=sample_model_a,
412-
strategy=sample_strategy,
412+
strategy=simple_strategy,
413413
runs=(run_a,),
414414
)
415415

416416
# Model B: high avg_round (15)
417-
run_b = make_run("run-b", sample_model_b, sample_strategy, final_round=15)
417+
run_b = make_run("run-b", sample_model_b, simple_strategy, final_round=15)
418418
runs_b = Runs(
419419
generated_at=0,
420420
model=sample_model_b,
421-
strategy=sample_strategy,
421+
strategy=simple_strategy,
422422
runs=(run_b,),
423423
)
424424

425425
# Create leaderboard (pass in unsorted order: A first, B second)
426426
with patch("time.time", return_value=1234567890):
427427
leaderboard = analyzer.create_models_leaderboard(
428-
sample_strategy, [runs_a, runs_b]
428+
simple_strategy, [runs_a, runs_b]
429429
)
430430

431431
# Entries should be sorted by avg_round descending (B first, A second)
432432
assert len(leaderboard.entries) == 2
433433
assert leaderboard.entries[0].model == sample_model_b # Higher avg_round
434-
assert leaderboard.entries[0].avg_round == 15.0
434+
assert leaderboard.entries[0].avg_round == pytest.approx(15.0)
435435
assert leaderboard.entries[1].model == sample_model_a # Lower avg_round
436-
assert leaderboard.entries[1].avg_round == 8.0
436+
assert leaderboard.entries[1].avg_round == pytest.approx(8.0)
437437

438438
# Verify leaderboard metadata
439-
assert leaderboard.strategy == sample_strategy
439+
assert leaderboard.strategy == simple_strategy
440440
assert leaderboard.generated_at == 1234567890
441441

442442

443443
def test_create_models_leaderboard_empty_list(
444444
analyzer: BenchmarkAnalyzer,
445-
sample_strategy: Strategy,
445+
simple_strategy: Strategy,
446446
) -> None:
447447
"""Empty runs list creates leaderboard with no entries."""
448448
with patch("time.time", return_value=1234567890):
449-
leaderboard = analyzer.create_models_leaderboard(sample_strategy, [])
449+
leaderboard = analyzer.create_models_leaderboard(simple_strategy, [])
450450

451451
assert len(leaderboard.entries) == 0
452-
assert leaderboard.strategy == sample_strategy
452+
assert leaderboard.strategy == simple_strategy
453453

454454

455455
def test_create_models_leaderboard_ties(
456456
analyzer: BenchmarkAnalyzer,
457-
sample_strategy: Strategy,
457+
simple_strategy: Strategy,
458458
sample_model_a: Model,
459459
sample_model_b: Model,
460460
) -> None:
461461
"""Leaderboard handles ties in avg_round (order is stable)."""
462462
# Both models have same avg_round
463-
run_a = make_run("run-a", sample_model_a, sample_strategy, final_round=10)
463+
run_a = make_run("run-a", sample_model_a, simple_strategy, final_round=10)
464464
runs_a = Runs(
465465
generated_at=0,
466466
model=sample_model_a,
467-
strategy=sample_strategy,
467+
strategy=simple_strategy,
468468
runs=(run_a,),
469469
)
470470

471-
run_b = make_run("run-b", sample_model_b, sample_strategy, final_round=10)
471+
run_b = make_run("run-b", sample_model_b, simple_strategy, final_round=10)
472472
runs_b = Runs(
473473
generated_at=0,
474474
model=sample_model_b,
475-
strategy=sample_strategy,
475+
strategy=simple_strategy,
476476
runs=(run_b,),
477477
)
478478

479-
leaderboard = analyzer.create_models_leaderboard(sample_strategy, [runs_a, runs_b])
479+
leaderboard = analyzer.create_models_leaderboard(simple_strategy, [runs_a, runs_b])
480480

481481
# Both should be present with same avg_round
482482
assert len(leaderboard.entries) == 2
483-
assert leaderboard.entries[0].avg_round == 10.0
484-
assert leaderboard.entries[1].avg_round == 10.0
483+
assert leaderboard.entries[0].avg_round == pytest.approx(10.0)
484+
assert leaderboard.entries[1].avg_round == pytest.approx(10.0)

0 commit comments

Comments (0)