feat: add normalized round metric accounting for voucher effects

S1M0N38 · S1M0N38 · commit ee14a56fc2a9 · 2026-02-13T18:03:19.000+01:00
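Add avg_round_norm and std_round_norm to LeaderboardEntry and final_round_norm to Run, so that round counts stay comparable when the v_hieroglyph or v_petroglyph vouchers are used. Each of these vouchers lowers the ante by 1, so for each one found in the run's last recorded gamestate a 3-round penalty is subtracted from final_round to obtain final_round_norm (both vouchers used: 6 rounds). Closes #22.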
diff --git a/src/balatrobench/analyzer.py b/src/balatrobench/analyzer.py
@@ -29,6 +29,22 @@ def _subdirs(path: Path) -> Iterator[Path]:
     return (p for p in path.iterdir() if p.is_dir())
 
 
+def _read_used_vouchers(run_dir: Path) -> set[str]:
+    """Return used_vouchers keys from the last non-blank gamestates.jsonl line.
+
+    Returns an empty set if the file is missing, empty, or lists no vouchers.
+    """
+    gamestates_path = run_dir / "gamestates.jsonl"
+    if not gamestates_path.exists():
+        return set()
+    last_line = ""  # stays empty for an empty or blank-only file
+    with open(gamestates_path, "r", encoding="utf-8") as f:
+        for line in f:
+            last_line = line if line.strip() else last_line
+    vouchers = json.loads(last_line).get("used_vouchers") if last_line else None
+    return set(vouchers or ())
+
+
 class BenchmarkAnalyzer:
     """Analyzes BalatroLLM runs and generates benchmark data."""
 
@@ -175,6 +191,15 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
                 cost_std=source_stats["cost_std"],
             )
 
+            # Compute normalized round (accounts for voucher effects)
+            used_vouchers = _read_used_vouchers(run_dir)
+            voucher_penalty = 0
+            if "v_hieroglyph" in used_vouchers:
+                voucher_penalty += 3
+            if "v_petroglyph" in used_vouchers:
+                voucher_penalty += 3
+            final_round_norm = source_stats["final_round"] - voucher_penalty
+
             # Run - direct field mapping
             run = Run(
                 id=run_dir.name,
@@ -185,6 +210,7 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
                 run_completed=source_stats["run_completed"],
                 final_ante=source_stats["final_ante"],
                 final_round=source_stats["final_round"],
+                final_round_norm=final_round_norm,
                 providers=tuple(source_stats["providers"].items()),
                 stats=stats,
             )
@@ -213,6 +239,11 @@ def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
         avg_round = sum(rounds) / n_runs
         std_round = statistics.stdev(rounds) if n_runs > 1 else 0.0
 
+        # Normalized round statistics (accounts for voucher effects)
+        rounds_norm = [r.final_round_norm for r in runs]
+        avg_round_norm = sum(rounds_norm) / n_runs
+        std_round_norm = statistics.stdev(rounds_norm) if n_runs > 1 else 0.0
+
         # Call statistics (sum across runs)
         calls_total = sum(r.stats.calls_total for r in runs)
         calls_success = sum(r.stats.calls_success for r in runs)
@@ -276,6 +307,8 @@ def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
             run_completed=sum(1 for r in runs if r.run_completed),
             avg_round=avg_round,
             std_round=std_round,
+            avg_round_norm=avg_round_norm,
+            std_round_norm=std_round_norm,
             stats=aggregated_stats,
         )
 
@@ -330,6 +363,8 @@ def create_models_leaderboard(
                 run_completed=entry.run_completed,
                 avg_round=entry.avg_round,
                 std_round=entry.std_round,
+                avg_round_norm=entry.avg_round_norm,
+                std_round_norm=entry.std_round_norm,
                 stats=entry.stats,
                 model=runs.model,
             )
@@ -361,6 +396,8 @@ def create_strategies_leaderboard(
                 run_completed=entry.run_completed,
                 avg_round=entry.avg_round,
                 std_round=entry.std_round,
+                avg_round_norm=entry.avg_round_norm,
+                std_round_norm=entry.std_round_norm,
                 stats=entry.stats,
                 strategy=runs.strategy,
             )
diff --git a/src/balatrobench/models.py b/src/balatrobench/models.py
@@ -159,6 +159,8 @@ class LeaderboardEntry:
     # Round statistics
     avg_round: float
     std_round: float
+    avg_round_norm: float  # Accounts for voucher effects (v_hieroglyph, v_petroglyph)
+    std_round_norm: float
 
     # Stats
     stats: Stats
@@ -218,6 +220,7 @@ class Run:
     # Final game state
     final_ante: int
     final_round: int
+    final_round_norm: int  # Accounts for voucher effects (v_hieroglyph, v_petroglyph)
 
     # Provider usage distribution (immutable tuple of (name, count) pairs)
     providers: tuple[tuple[str, int], ...]
diff --git a/src/balatrobench/source.py b/src/balatrobench/source.py
@@ -7,6 +7,12 @@
 from typing import TypedDict
 
 
+class SourceGamestate(TypedDict):
+    """Shape of one gamestates.jsonl entry; declares only the used_vouchers field."""
+
+    used_vouchers: dict[str, str]  # Keyed by voucher name, e.g. "v_hieroglyph"; only the keys are read
+
+
 class SourceModel(TypedDict):
     """Model identification in task.json."""
 
diff --git a/tests/balatrobench/unit/test_analyzer.py b/tests/balatrobench/unit/test_analyzer.py
@@ -107,6 +107,7 @@ def make_run(
         run_completed=run_completed,
         final_ante=3,
         final_round=final_round,
+        final_round_norm=final_round,
         providers=(("OpenAI", 100),),
         stats=stats or make_stats(),
     )
diff --git a/tests/balatrobench/unit/test_models.py b/tests/balatrobench/unit/test_models.py
@@ -89,6 +89,8 @@ def test_leaderboard_entry_inheritance(
         run_completed=4,
         avg_round=8.5,
         std_round=2.1,
+        avg_round_norm=8.5,
+        std_round_norm=2.1,
         stats=sample_stats,
         model=sample_model,
     )
diff --git a/tests/balatrobench/unit/test_writer.py b/tests/balatrobench/unit/test_writer.py
@@ -34,6 +34,8 @@ def test_to_dict_nested_dataclass(sample_stats: Stats, sample_model: Model) -> N
         run_completed=4,
         avg_round=8.5,
         std_round=2.1,
+        avg_round_norm=8.5,
+        std_round_norm=2.1,
         stats=sample_stats,
         model=sample_model,
     )
@@ -419,6 +421,8 @@ def sample_strategies_leaderboard(
             run_completed=2,
             avg_round=10.5,
             std_round=2.0,
+            avg_round_norm=10.5,
+            std_round_norm=2.0,
             stats=sample_stats,
             strategy=sample_strategy,
         )
@@ -484,6 +488,7 @@ def sample_runs(
             run_completed=True,
             final_ante=3,
             final_round=10,
+            final_round_norm=10,
             providers=(("OpenAI", 50),),
             stats=sample_stats,
         )

Original file line number	Diff line number	Diff line change
`@@ -107,6 +107,7 @@ def make_run(`
`107`	`107`	`run_completed=run_completed,`
`108`	`108`	`final_ante=3,`
`109`	`109`	`final_round=final_round,`
	`110`	`+ final_round_norm=final_round,`
`110`	`111`	`providers=(("OpenAI", 100),),`
`111`	`112`	`stats=stats or make_stats(),`
`112`	`113`	`)`
Original file line number	Diff line number	Diff line change
`@@ -89,6 +89,8 @@ def test_leaderboard_entry_inheritance(`
`89`	`89`	`run_completed=4,`
`90`	`90`	`avg_round=8.5,`
`91`	`91`	`std_round=2.1,`
	`92`	`+ avg_round_norm=8.5,`
	`93`	`+ std_round_norm=2.1,`
`92`	`94`	`stats=sample_stats,`
`93`	`95`	`model=sample_model,`
`94`	`96`	`)`