Skip to content

Commit abd56fa

Browse files
committed
feat(analyzer): add benchmark analysis engine for BalatroLLM runs
1 parent 3a2dc3d commit abd56fa

1 file changed

Lines changed: 372 additions & 0 deletions

File tree

src/balatrobench/analyzer.py

Lines changed: 372 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,372 @@
1+
"""Benchmark analysis for BalatroLLM runs."""
2+
3+
import json
4+
import statistics
5+
import time
6+
from collections import defaultdict
7+
from collections.abc import Iterator
8+
from pathlib import Path
9+
10+
from .enums import Deck, Stake
11+
from .models import (
12+
Config,
13+
LeaderboardEntry,
14+
Model,
15+
ModelsLeaderboard,
16+
ModelsLeaderboardEntry,
17+
Run,
18+
Runs,
19+
Stats,
20+
StrategiesLeaderboard,
21+
StrategiesLeaderboardEntry,
22+
Strategy,
23+
)
24+
from .source import SourceStats, SourceStrategy, SourceTask
25+
26+
27+
def _subdirs(path: Path) -> Iterator[Path]:
    """Yield only the immediate subdirectories of a path, in sorted order.

    ``Path.iterdir`` returns children in arbitrary, filesystem-dependent
    order. Sorting makes every traversal built on this helper reproducible
    from one machine (or one invocation) to the next.

    Args:
        path: Directory whose direct children are inspected.

    Returns:
        An iterator over the child paths for which ``is_dir()`` is true,
        ordered by normal ``Path`` comparison.

    Raises:
        OSError: Propagated from ``path.iterdir()`` (for example when *path*
            does not exist). The listing is read when this function is
            called rather than on first iteration.
    """
    return iter(sorted(child for child in path.iterdir() if child.is_dir()))
30+
31+
32+
class BenchmarkAnalyzer:
    """Analyzes BalatroLLM runs and generates benchmark data.

    The methods walk a directory tree shaped like
    ``<version_dir>/<strategy>/<vendor>/<model>/<run>/`` where each run
    directory holds ``stats.json`` and ``task.json`` and, optionally,
    ``strategy.json``.
    """

    # Keys copied one-to-one from a run's ``stats.json`` into ``Stats``.
    _STATS_FIELDS: tuple[str, ...] = (
        "calls_total",
        "calls_success",
        "calls_error",
        "calls_failed",
        "tokens_in_total",
        "tokens_out_total",
        "tokens_in_avg",
        "tokens_out_avg",
        "tokens_in_std",
        "tokens_out_std",
        "time_total_ms",
        "time_avg_ms",
        "time_std_ms",
        "cost_total",
        "cost_avg",
        "cost_std",
    )

    def __init__(
        self,
        runs_dir: Path = Path("runs"),
        output_dir: Path = Path("site/benchmarks"),
    ) -> None:
        """Store the input and output locations.

        Args:
            runs_dir: Root directory of recorded runs.
            output_dir: Directory intended for generated benchmark data.

        Neither path is touched here; they are only stored on the instance.
        """
        self.runs_dir = runs_dir
        self.output_dir = output_dir

    # ------------------------------------------------------------------
    # Directory traversal
    # ------------------------------------------------------------------

    def analyze_models(self, version_dir: Path) -> dict[str, list[Runs]]:
        """Analyze a version by comparing models within each strategy.

        Args:
            version_dir: Directory whose subdirectories are strategies.

        Returns:
            A dict mapping strategy directory name to a list of Runs
            (one per model that has at least one complete run).

        Raises:
            FileNotFoundError: If *version_dir* is not a directory.
        """
        if not version_dir.is_dir():
            raise FileNotFoundError(f"Version directory not found: {version_dir}")

        return {
            strategy_dir.name: self._analyze_strategy(strategy_dir)
            for strategy_dir in _subdirs(version_dir)
        }

    def analyze_strategies(self, version_dir: Path) -> dict[str, list[Runs]]:
        """Analyze a version by comparing strategies for each model.

        Args:
            version_dir: Directory whose subdirectories are strategies.

        Returns:
            A dict mapping "vendor/model" to a list of Runs (one per strategy
            directory in which that model has at least one complete run).

        Raises:
            FileNotFoundError: If *version_dir* is not a directory.
        """
        if not version_dir.is_dir():
            raise FileNotFoundError(f"Version directory not found: {version_dir}")

        # Group every model directory by "vendor/model" across all strategies.
        models_by_key: defaultdict[str, list[Path]] = defaultdict(list)
        for strategy_dir in _subdirs(version_dir):
            for vendor_dir in _subdirs(strategy_dir):
                for model_dir in _subdirs(vendor_dir):
                    model_key = f"{vendor_dir.name}/{model_dir.name}"
                    models_by_key[model_key].append(model_dir)

        # A model key is kept even when none of its directories yields runs,
        # in which case it maps to an empty list.
        result: dict[str, list[Runs]] = {}
        for model_key, model_dirs in models_by_key.items():
            computed = (self._compute_runs(model_dir) for model_dir in model_dirs)
            result[model_key] = [runs for runs in computed if runs]
        return result

    def _analyze_strategy(self, strategy_dir: Path) -> list[Runs]:
        """Analyze all models within a strategy directory.

        Returns one Runs per ``<vendor>/<model>`` directory that contains at
        least one complete run; models without usable runs are omitted.
        """
        runs_list: list[Runs] = []
        for vendor_dir in _subdirs(strategy_dir):
            for model_dir in _subdirs(vendor_dir):
                runs = self._compute_runs(model_dir)
                if runs:
                    runs_list.append(runs)
        return runs_list

    # ------------------------------------------------------------------
    # Loading runs from disk
    # ------------------------------------------------------------------

    @staticmethod
    def _load_json(file: Path):
        """Parse *file* as JSON, decoding it explicitly as UTF-8.

        JSON interchange is UTF-8; relying on the locale default encoding
        breaks on platforms where that default is something else.
        """
        with file.open(encoding="utf-8") as f:
            return json.load(f)

    def _load_strategy(self, strategy_file: Path, source_task: SourceTask) -> Strategy:
        """Build a Strategy from ``strategy.json`` or, failing that, the task.

        When *strategy_file* does not exist only the strategy name is known
        (taken from ``source_task["strategy"]``); the remaining fields are
        left empty.
        """
        if strategy_file.exists():
            source_strategy: SourceStrategy = self._load_json(strategy_file)
            return Strategy(
                name=source_strategy["name"],
                description=source_strategy["description"],
                author=source_strategy["author"],
                version=source_strategy["version"],
                tags=tuple(source_strategy["tags"]),
            )
        return Strategy(
            name=source_task["strategy"],
            description="",
            author="",
            version="",
            tags=(),
        )

    def _build_stats(self, source_stats: SourceStats) -> Stats:
        """Copy the per-run statistics fields 1:1 from ``stats.json``."""
        return Stats(**{field: source_stats[field] for field in self._STATS_FIELDS})

    def _compute_runs(self, model_dir: Path) -> Runs | None:
        """Compute Runs from a model's run directories.

        Run directories are visited in sorted order so that the resulting
        run tuple, and the choice of which run supplies the shared model and
        strategy objects, do not depend on filesystem listing order.

        Args:
            model_dir: Directory whose subdirectories are individual runs.

        Returns:
            A Runs object covering every complete run, or ``None`` when no
            run directory contains both ``stats.json`` and ``task.json``.

        Raises:
            KeyError: If a loaded JSON document lacks an expected key.
            json.JSONDecodeError: If a file is not valid JSON.
        """
        run_list: list[Run] = []
        strategy_obj: Strategy | None = None
        model_obj: Model | None = None

        for run_dir in sorted(_subdirs(model_dir)):
            stats_file = run_dir / "stats.json"
            task_file = run_dir / "task.json"

            if not stats_file.exists() or not task_file.exists():
                print(f"Skipping incomplete run: {run_dir.name}")
                continue

            source_stats: SourceStats = self._load_json(stats_file)
            source_task: SourceTask = self._load_json(task_file)

            # Model and strategy are taken from the first complete run and
            # shared by every Run object created for this directory.
            if model_obj is None:
                model_obj = Model(
                    vendor=source_task["model"]["vendor"],
                    name=source_task["model"]["name"],
                )
            if strategy_obj is None:
                strategy_obj = self._load_strategy(
                    run_dir / "strategy.json", source_task
                )

            config = Config(
                seed=source_task["seed"],
                deck=Deck(source_task["deck"]),
                stake=Stake(source_task["stake"]),
            )

            run_list.append(
                Run(
                    id=run_dir.name,
                    model=model_obj,
                    strategy=strategy_obj,
                    config=config,
                    run_won=source_stats["run_won"],
                    run_completed=source_stats["run_completed"],
                    final_ante=source_stats["final_ante"],
                    final_round=source_stats["final_round"],
                    providers=tuple(source_stats["providers"].items()),
                    stats=self._build_stats(source_stats),
                )
            )

        if not run_list:
            return None

        # Both are set in the same iteration that first appends to run_list;
        # the asserts only narrow the Optional types for type checkers.
        assert model_obj is not None
        assert strategy_obj is not None

        return Runs(
            generated_at=int(time.time()),
            model=model_obj,
            strategy=strategy_obj,
            runs=tuple(run_list),
        )

    # ------------------------------------------------------------------
    # Aggregation
    # ------------------------------------------------------------------

    def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
        """Aggregate Runs into a LeaderboardEntry (base stats only).

        Totals are summed across runs; per-call averages are the pooled
        totals divided by the total number of calls; standard deviations are
        pooled from the per-run figures.

        Args:
            runs: The runs to aggregate. An empty tuple yields an entry whose
                counts and statistics are all zero.

        Returns:
            The aggregated LeaderboardEntry.
        """
        n_runs = len(runs)

        # Round statistics. stdev needs at least two data points.
        rounds = [r.final_round for r in runs]
        avg_round = sum(rounds) / n_runs if n_runs else 0.0
        std_round = statistics.stdev(rounds) if n_runs > 1 else 0.0

        # Call statistics (sum across runs)
        calls_total = sum(r.stats.calls_total for r in runs)
        calls_success = sum(r.stats.calls_success for r in runs)
        calls_error = sum(r.stats.calls_error for r in runs)
        calls_failed = sum(r.stats.calls_failed for r in runs)

        # Token, time and cost totals
        tokens_in_total = sum(r.stats.tokens_in_total for r in runs)
        tokens_out_total = sum(r.stats.tokens_out_total for r in runs)
        time_total_ms = sum(r.stats.time_total_ms for r in runs)
        cost_total = sum(r.stats.cost_total for r in runs)

        # Per-call averages (pooled across all runs)
        if calls_total > 0:
            tokens_in_avg = tokens_in_total / calls_total
            tokens_out_avg = tokens_out_total / calls_total
            time_avg_ms = time_total_ms / calls_total
            cost_avg = cost_total / calls_total
        else:
            tokens_in_avg = tokens_out_avg = time_avg_ms = cost_avg = 0.0

        # Pooled standard deviations (computed from per-run stats)
        tokens_in_std = self._pooled_std_dev_from_runs(
            runs, "tokens_in_std", "tokens_in_avg", tokens_in_avg, calls_total
        )
        tokens_out_std = self._pooled_std_dev_from_runs(
            runs, "tokens_out_std", "tokens_out_avg", tokens_out_avg, calls_total
        )
        time_std_ms = self._pooled_std_dev_from_runs(
            runs, "time_std_ms", "time_avg_ms", time_avg_ms, calls_total
        )
        cost_std = self._pooled_std_dev_from_runs(
            runs, "cost_std", "cost_avg", cost_avg, calls_total
        )

        aggregated_stats = Stats(
            calls_total=calls_total,
            calls_success=calls_success,
            calls_error=calls_error,
            calls_failed=calls_failed,
            tokens_in_total=tokens_in_total,
            tokens_out_total=tokens_out_total,
            tokens_in_avg=tokens_in_avg,
            tokens_out_avg=tokens_out_avg,
            tokens_in_std=tokens_in_std,
            tokens_out_std=tokens_out_std,
            time_total_ms=time_total_ms,
            time_avg_ms=time_avg_ms,
            time_std_ms=time_std_ms,
            cost_total=cost_total,
            cost_avg=cost_avg,
            cost_std=cost_std,
        )

        return LeaderboardEntry(
            run_count=n_runs,
            run_wins=sum(1 for r in runs if r.run_won),
            run_completed=sum(1 for r in runs if r.run_completed),
            avg_round=avg_round,
            std_round=std_round,
            stats=aggregated_stats,
        )

    def _pooled_std_dev_from_runs(
        self,
        runs: tuple[Run, ...],
        std_attr: str,
        avg_attr: str,
        overall_mean: float,
        total_n: int,
    ) -> float:
        """Compute pooled standard deviation across multiple runs.

        Uses the formula for pooled variance when combining samples with
        different means and standard deviations::

            var = sum((n_i - 1) * s_i**2 + n_i * (mean_i - mean)**2) / (N - 1)

        NOTE(review): the ``(n_i - 1)`` weight treats each per-run figure as
        a sample standard deviation; confirm against whatever writes
        ``stats.json``.

        Args:
            runs: Tuple of Run objects
            std_attr: Attribute name for std dev on Run.stats (e.g., "tokens_in_std")
            avg_attr: Attribute name for average on Run.stats (e.g., "tokens_in_avg")
            overall_mean: The overall mean across all runs
            total_n: Total number of observations (calls) across all runs

        Returns:
            The pooled standard deviation as a non-negative float, or 0.0
            when there are fewer than two observations in total.
        """
        if total_n <= 1:
            return 0.0

        numerator = 0.0
        for run in runs:
            n_i = run.stats.calls_total
            if n_i <= 0:
                # A run with no observations carries no information; left in,
                # its (n_i - 1) weight would be negative and corrupt the sum.
                continue
            s_i = getattr(run.stats, std_attr)
            mean_i = getattr(run.stats, avg_attr)
            numerator += (n_i - 1) * (s_i**2) + n_i * ((mean_i - overall_mean) ** 2)

        # Clamp: a negative float raised to 0.5 yields a complex number.
        pooled_var = max(numerator / (total_n - 1), 0.0)
        return pooled_var**0.5

    # ------------------------------------------------------------------
    # Leaderboards
    # ------------------------------------------------------------------

    @staticmethod
    def _base_fields(entry: LeaderboardEntry) -> dict:
        """Return the LeaderboardEntry fields shared by both leaderboard kinds."""
        return {
            "run_count": entry.run_count,
            "run_wins": entry.run_wins,
            "run_completed": entry.run_completed,
            "avg_round": entry.avg_round,
            "std_round": entry.std_round,
            "stats": entry.stats,
        }

    def _ranked_entries(
        self, runs_list: list[Runs]
    ) -> list[tuple[LeaderboardEntry, Runs]]:
        """Pair each Runs with its aggregate, best average round first.

        The sort is stable, so entries with equal ``avg_round`` keep the
        order they had in *runs_list*.
        """
        pairs = [(self._compute_leaderboard_entry(runs.runs), runs) for runs in runs_list]
        pairs.sort(key=lambda pair: pair[0].avg_round, reverse=True)
        return pairs

    def create_models_leaderboard(
        self, strategy: Strategy, runs_list: list[Runs]
    ) -> ModelsLeaderboard:
        """Create a ModelsLeaderboard from Runs list.

        Compares different models using the same strategy.

        Args:
            strategy: The strategy all entries share.
            runs_list: One Runs per model.

        Returns:
            A ModelsLeaderboard whose entries are sorted by ``avg_round``
            descending and stamped with the current Unix time.
        """
        entries = tuple(
            ModelsLeaderboardEntry(**self._base_fields(entry), model=runs.model)
            for entry, runs in self._ranked_entries(runs_list)
        )
        return ModelsLeaderboard(
            generated_at=int(time.time()),
            strategy=strategy,
            entries=entries,
        )

    def create_strategies_leaderboard(
        self, model: Model, runs_list: list[Runs]
    ) -> StrategiesLeaderboard:
        """Create a StrategiesLeaderboard from Runs list.

        Compares different strategies for the same model.

        Args:
            model: The model all entries share.
            runs_list: One Runs per strategy.

        Returns:
            A StrategiesLeaderboard whose entries are sorted by ``avg_round``
            descending and stamped with the current Unix time.
        """
        entries = tuple(
            StrategiesLeaderboardEntry(
                **self._base_fields(entry), strategy=runs.strategy
            )
            for entry, runs in self._ranked_entries(runs_list)
        )
        return StrategiesLeaderboard(
            generated_at=int(time.time()),
            model=model,
            entries=entries,
        )

0 commit comments

Comments
 (0)