Skip to content

Commit ee14a56

Browse files
committed
feat: add normalized round metric accounting for voucher effects
Add avg_round_norm and std_round_norm to LeaderboardEntry, and final_round_norm to Run, to normalize the round count when the v_hieroglyph or v_petroglyph vouchers are used. Each of these vouchers reduces the effective ante by 1, so a penalty of 3 rounds is subtracted from the final round for each voucher used. Closes #22.
1 parent 3487367 commit ee14a56

6 files changed

Lines changed: 54 additions & 0 deletions

File tree

src/balatrobench/analyzer.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,22 @@ def _subdirs(path: Path) -> Iterator[Path]:
2929
return (p for p in path.iterdir() if p.is_dir())
3030

3131

32+
def _read_used_vouchers(run_dir: Path) -> set[str]:
    """Return the voucher keys recorded in the last gamestate of a run.

    Reads ``gamestates.jsonl`` inside ``run_dir`` and parses its last
    non-blank line as JSON, then returns the keys of that entry's
    ``used_vouchers`` mapping.

    Args:
        run_dir: Directory of a single run.

    Returns:
        The set of keys under ``used_vouchers``. An empty set is returned
        when the file is missing, is empty or contains only blank lines,
        or when ``used_vouchers`` is absent or is not a JSON object.

    Raises:
        json.JSONDecodeError: If the last non-blank line is not valid JSON.
    """
    gamestates_path = run_dir / "gamestates.jsonl"

    # Stream the file and remember only the last non-blank line. Starting
    # from "" (instead of relying on the loop variable after the loop)
    # keeps an empty file from raising UnboundLocalError, and skipping
    # blank lines keeps a trailing newline from being fed to json.loads.
    last_line = ""
    try:
        with gamestates_path.open("r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    last_line = line
    except FileNotFoundError:
        return set()

    if not last_line:
        return set()

    data = json.loads(last_line)
    used_vouchers = data.get("used_vouchers")
    # The value may be null or serialized as an empty list rather than an
    # object; treat anything that is not a mapping as "no vouchers used".
    if not isinstance(used_vouchers, dict):
        return set()
    return set(used_vouchers)
46+
47+
3248
class BenchmarkAnalyzer:
3349
"""Analyzes BalatroLLM runs and generates benchmark data."""
3450

@@ -175,6 +191,15 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
175191
cost_std=source_stats["cost_std"],
176192
)
177193

194+
# Compute normalized round (accounts for voucher effects)
195+
used_vouchers = _read_used_vouchers(run_dir)
196+
voucher_penalty = 0
197+
if "v_hieroglyph" in used_vouchers:
198+
voucher_penalty += 3
199+
if "v_petroglyph" in used_vouchers:
200+
voucher_penalty += 3
201+
final_round_norm = source_stats["final_round"] - voucher_penalty
202+
178203
# Run - direct field mapping
179204
run = Run(
180205
id=run_dir.name,
@@ -185,6 +210,7 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
185210
run_completed=source_stats["run_completed"],
186211
final_ante=source_stats["final_ante"],
187212
final_round=source_stats["final_round"],
213+
final_round_norm=final_round_norm,
188214
providers=tuple(source_stats["providers"].items()),
189215
stats=stats,
190216
)
@@ -213,6 +239,11 @@ def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
213239
avg_round = sum(rounds) / n_runs
214240
std_round = statistics.stdev(rounds) if n_runs > 1 else 0.0
215241

242+
# Normalized round statistics (accounts for voucher effects)
243+
rounds_norm = [r.final_round_norm for r in runs]
244+
avg_round_norm = sum(rounds_norm) / n_runs
245+
std_round_norm = statistics.stdev(rounds_norm) if n_runs > 1 else 0.0
246+
216247
# Call statistics (sum across runs)
217248
calls_total = sum(r.stats.calls_total for r in runs)
218249
calls_success = sum(r.stats.calls_success for r in runs)
@@ -276,6 +307,8 @@ def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
276307
run_completed=sum(1 for r in runs if r.run_completed),
277308
avg_round=avg_round,
278309
std_round=std_round,
310+
avg_round_norm=avg_round_norm,
311+
std_round_norm=std_round_norm,
279312
stats=aggregated_stats,
280313
)
281314

@@ -330,6 +363,8 @@ def create_models_leaderboard(
330363
run_completed=entry.run_completed,
331364
avg_round=entry.avg_round,
332365
std_round=entry.std_round,
366+
avg_round_norm=entry.avg_round_norm,
367+
std_round_norm=entry.std_round_norm,
333368
stats=entry.stats,
334369
model=runs.model,
335370
)
@@ -361,6 +396,8 @@ def create_strategies_leaderboard(
361396
run_completed=entry.run_completed,
362397
avg_round=entry.avg_round,
363398
std_round=entry.std_round,
399+
avg_round_norm=entry.avg_round_norm,
400+
std_round_norm=entry.std_round_norm,
364401
stats=entry.stats,
365402
strategy=runs.strategy,
366403
)

src/balatrobench/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,8 @@ class LeaderboardEntry:
159159
# Round statistics
160160
avg_round: float
161161
std_round: float
162+
avg_round_norm: float # Accounts for voucher effects (v_hieroglyph, v_petroglyph)
163+
std_round_norm: float
162164

163165
# Stats
164166
stats: Stats
@@ -218,6 +220,7 @@ class Run:
218220
# Final game state
219221
final_ante: int
220222
final_round: int
223+
final_round_norm: int # Accounts for voucher effects (v_hieroglyph, v_petroglyph)
221224

222225
# Provider usage distribution (immutable tuple of (name, count) pairs)
223226
providers: tuple[tuple[str, int], ...]

src/balatrobench/source.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
from typing import TypedDict
88

99

10+
class SourceGamestate(TypedDict):
    """Shape of one JSON entry (one line) in gamestates.jsonl.

    Only ``used_vouchers`` is declared here.
    NOTE(review): real entries may carry additional keys - confirm
    against the producer of gamestates.jsonl.
    """

    # Mapping keyed by voucher name, e.g. "v_hieroglyph". Values are typed
    # as str; their meaning is not visible here.
    used_vouchers: dict[str, str]
14+
15+
1016
class SourceModel(TypedDict):
1117
"""Model identification in task.json."""
1218

tests/balatrobench/unit/test_analyzer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def make_run(
107107
run_completed=run_completed,
108108
final_ante=3,
109109
final_round=final_round,
110+
final_round_norm=final_round,
110111
providers=(("OpenAI", 100),),
111112
stats=stats or make_stats(),
112113
)

tests/balatrobench/unit/test_models.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ def test_leaderboard_entry_inheritance(
8989
run_completed=4,
9090
avg_round=8.5,
9191
std_round=2.1,
92+
avg_round_norm=8.5,
93+
std_round_norm=2.1,
9294
stats=sample_stats,
9395
model=sample_model,
9496
)

tests/balatrobench/unit/test_writer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def test_to_dict_nested_dataclass(sample_stats: Stats, sample_model: Model) -> N
3434
run_completed=4,
3535
avg_round=8.5,
3636
std_round=2.1,
37+
avg_round_norm=8.5,
38+
std_round_norm=2.1,
3739
stats=sample_stats,
3840
model=sample_model,
3941
)
@@ -419,6 +421,8 @@ def sample_strategies_leaderboard(
419421
run_completed=2,
420422
avg_round=10.5,
421423
std_round=2.0,
424+
avg_round_norm=10.5,
425+
std_round_norm=2.0,
422426
stats=sample_stats,
423427
strategy=sample_strategy,
424428
)
@@ -484,6 +488,7 @@ def sample_runs(
484488
run_completed=True,
485489
final_ante=3,
486490
final_round=10,
491+
final_round_norm=10,
487492
providers=(("OpenAI", 50),),
488493
stats=sample_stats,
489494
)

0 commit comments

Comments
 (0)