@@ -29,6 +29,22 @@ def _subdirs(path: Path) -> Iterator[Path]:
2929 return (p for p in path .iterdir () if p .is_dir ())
3030
3131
def _read_used_vouchers(run_dir: Path) -> set[str]:
    """Return the voucher keys recorded in the latest gamestate of a run.

    Reads ``gamestates.jsonl`` inside *run_dir*, parses its last non-blank
    line as JSON, and returns the keys of that entry's ``used_vouchers``
    mapping.

    Args:
        run_dir: Directory expected to contain ``gamestates.jsonl``.

    Returns:
        The keys of ``used_vouchers`` from the last entry. An empty set is
        returned when the file is missing, when it is empty or contains only
        blank lines, or when the entry has no (or a null) ``used_vouchers``.

    Raises:
        json.JSONDecodeError: If the last non-blank line is not valid JSON.
    """
    gamestates_path = run_dir / "gamestates.jsonl"

    # Track the last non-blank line so that an empty file or a trailing
    # newline/blank line does not leave us with nothing (or "") to parse.
    last_line = ""
    try:
        with open(gamestates_path, encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    last_line = line
    except FileNotFoundError:
        return set()

    if not last_line:
        return set()

    data = json.loads(last_line)
    # `or {}` also covers an explicit JSON null for "used_vouchers".
    return set(data.get("used_vouchers") or {})
46+
47+
3248class BenchmarkAnalyzer :
3349 """Analyzes BalatroLLM runs and generates benchmark data."""
3450
@@ -175,6 +191,15 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
175191 cost_std = source_stats ["cost_std" ],
176192 )
177193
194+ # Compute normalized round (accounts for voucher effects)
195+ used_vouchers = _read_used_vouchers (run_dir )
196+ voucher_penalty = 0
197+ if "v_hieroglyph" in used_vouchers :
198+ voucher_penalty += 3
199+ if "v_petroglyph" in used_vouchers :
200+ voucher_penalty += 3
201+ final_round_norm = source_stats ["final_round" ] - voucher_penalty
202+
178203 # Run - direct field mapping
179204 run = Run (
180205 id = run_dir .name ,
@@ -185,6 +210,7 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
185210 run_completed = source_stats ["run_completed" ],
186211 final_ante = source_stats ["final_ante" ],
187212 final_round = source_stats ["final_round" ],
213+ final_round_norm = final_round_norm ,
188214 providers = tuple (source_stats ["providers" ].items ()),
189215 stats = stats ,
190216 )
@@ -213,6 +239,11 @@ def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
213239 avg_round = sum (rounds ) / n_runs
214240 std_round = statistics .stdev (rounds ) if n_runs > 1 else 0.0
215241
242+ # Normalized round statistics (accounts for voucher effects)
243+ rounds_norm = [r .final_round_norm for r in runs ]
244+ avg_round_norm = sum (rounds_norm ) / n_runs
245+ std_round_norm = statistics .stdev (rounds_norm ) if n_runs > 1 else 0.0
246+
216247 # Call statistics (sum across runs)
217248 calls_total = sum (r .stats .calls_total for r in runs )
218249 calls_success = sum (r .stats .calls_success for r in runs )
@@ -276,6 +307,8 @@ def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
276307 run_completed = sum (1 for r in runs if r .run_completed ),
277308 avg_round = avg_round ,
278309 std_round = std_round ,
310+ avg_round_norm = avg_round_norm ,
311+ std_round_norm = std_round_norm ,
279312 stats = aggregated_stats ,
280313 )
281314
@@ -330,6 +363,8 @@ def create_models_leaderboard(
330363 run_completed = entry .run_completed ,
331364 avg_round = entry .avg_round ,
332365 std_round = entry .std_round ,
366+ avg_round_norm = entry .avg_round_norm ,
367+ std_round_norm = entry .std_round_norm ,
333368 stats = entry .stats ,
334369 model = runs .model ,
335370 )
@@ -361,6 +396,8 @@ def create_strategies_leaderboard(
361396 run_completed = entry .run_completed ,
362397 avg_round = entry .avg_round ,
363398 std_round = entry .std_round ,
399+ avg_round_norm = entry .avg_round_norm ,
400+ std_round_norm = entry .std_round_norm ,
364401 stats = entry .stats ,
365402 strategy = runs .strategy ,
366403 )
0 commit comments