|
| 1 | +"""Benchmark analysis for BalatroLLM runs.""" |
| 2 | + |
| 3 | +import json |
| 4 | +import statistics |
| 5 | +import time |
| 6 | +from collections import defaultdict |
| 7 | +from collections.abc import Iterator |
| 8 | +from pathlib import Path |
| 9 | + |
| 10 | +from .enums import Deck, Stake |
| 11 | +from .models import ( |
| 12 | + Config, |
| 13 | + LeaderboardEntry, |
| 14 | + Model, |
| 15 | + ModelsLeaderboard, |
| 16 | + ModelsLeaderboardEntry, |
| 17 | + Run, |
| 18 | + Runs, |
| 19 | + Stats, |
| 20 | + StrategiesLeaderboard, |
| 21 | + StrategiesLeaderboardEntry, |
| 22 | + Strategy, |
| 23 | +) |
| 24 | +from .source import SourceStats, SourceStrategy, SourceTask |
| 25 | + |
| 26 | + |
| 27 | +def _subdirs(path: Path) -> Iterator[Path]: |
| 28 | + """Yield only subdirectories of a path.""" |
| 29 | + return (p for p in path.iterdir() if p.is_dir()) |
| 30 | + |
| 31 | + |
class BenchmarkAnalyzer:
    """Analyze BalatroLLM run directories and build benchmark leaderboards.

    The methods below walk a tree shaped like
    ``<version_dir>/<strategy>/<vendor>/<model>/<run>/``. Each run directory
    must contain ``stats.json`` and ``task.json`` and may contain
    ``strategy.json``.
    """

    def __init__(
        self,
        runs_dir: Path = Path("runs"),
        output_dir: Path = Path("site/benchmarks"),
    ) -> None:
        """Store the input and output locations as attributes.

        Args:
            runs_dir: Value kept as ``self.runs_dir``.
            output_dir: Value kept as ``self.output_dir``.
        """
        self.runs_dir = runs_dir
        self.output_dir = output_dir

    def analyze_models(self, version_dir: Path) -> dict[str, list[Runs]]:
        """Analyze a version by comparing models within each strategy.

        Args:
            version_dir: Directory whose immediate subdirectories are strategies.

        Returns:
            Mapping of strategy directory name to the list of ``Runs`` found
            under it (one per model directory that has at least one complete
            run).

        Raises:
            FileNotFoundError: If ``version_dir`` is not an existing directory.
        """
        if not version_dir.is_dir():
            raise FileNotFoundError(f"Version directory not found: {version_dir}")

        # Directory listing order is arbitrary; sorting keeps the result order
        # reproducible.
        return {
            strategy_dir.name: self._analyze_strategy(strategy_dir)
            for strategy_dir in sorted(_subdirs(version_dir))
        }

    def analyze_strategies(self, version_dir: Path) -> dict[str, list[Runs]]:
        """Analyze a version by comparing strategies for each model.

        Args:
            version_dir: Directory whose immediate subdirectories are strategies.

        Returns:
            Mapping of ``"vendor/model"`` to a list of ``Runs``, one per
            strategy directory in which that model has at least one complete
            run. A model with no complete runs anywhere maps to an empty list.

        Raises:
            FileNotFoundError: If ``version_dir`` is not an existing directory.
        """
        if not version_dir.is_dir():
            raise FileNotFoundError(f"Version directory not found: {version_dir}")

        # First pass: gather every model directory under its "vendor/model" key.
        # Sorted traversal keeps key order and per-key strategy order stable.
        model_dirs_by_key: defaultdict[str, list[Path]] = defaultdict(list)
        for strategy_dir in sorted(_subdirs(version_dir)):
            for vendor_dir in sorted(_subdirs(strategy_dir)):
                for model_dir in sorted(_subdirs(vendor_dir)):
                    model_key = f"{vendor_dir.name}/{model_dir.name}"
                    model_dirs_by_key[model_key].append(model_dir)

        # Second pass: load the runs of each model directory.
        result: dict[str, list[Runs]] = {}
        for model_key, model_dirs in model_dirs_by_key.items():
            loaded = (self._compute_runs(model_dir) for model_dir in model_dirs)
            result[model_key] = [runs for runs in loaded if runs]
        return result

    def _analyze_strategy(self, strategy_dir: Path) -> list[Runs]:
        """Load the ``Runs`` of every ``<vendor>/<model>`` directory in a strategy.

        Model directories without any complete run are left out of the list.
        """
        runs_list: list[Runs] = []
        for vendor_dir in sorted(_subdirs(strategy_dir)):
            for model_dir in sorted(_subdirs(vendor_dir)):
                runs = self._compute_runs(model_dir)
                if runs:
                    runs_list.append(runs)
        return runs_list

    @staticmethod
    def _load_json(path: Path):
        """Parse the JSON document stored at ``path``.

        The file is decoded as UTF-8 explicitly instead of relying on the
        platform's default text encoding.
        """
        with path.open(encoding="utf-8") as f:
            return json.load(f)

    def _load_strategy(self, strategy_file: Path, source_task: SourceTask) -> Strategy:
        """Build the ``Strategy`` for a run.

        Uses ``strategy_file`` when it exists; otherwise falls back to the
        strategy name recorded in the task with all other fields left empty.
        """
        if strategy_file.exists():
            source_strategy: SourceStrategy = self._load_json(strategy_file)
            return Strategy(
                name=source_strategy["name"],
                description=source_strategy["description"],
                author=source_strategy["author"],
                version=source_strategy["version"],
                tags=tuple(source_strategy["tags"]),
            )
        return Strategy(
            name=source_task["strategy"],
            description="",
            author="",
            version="",
            tags=(),
        )

    @staticmethod
    def _stats_from_source(source_stats: SourceStats) -> Stats:
        """Copy the per-run call/token/time/cost figures into a ``Stats`` (1:1)."""
        return Stats(
            calls_total=source_stats["calls_total"],
            calls_success=source_stats["calls_success"],
            calls_error=source_stats["calls_error"],
            calls_failed=source_stats["calls_failed"],
            tokens_in_total=source_stats["tokens_in_total"],
            tokens_out_total=source_stats["tokens_out_total"],
            tokens_in_avg=source_stats["tokens_in_avg"],
            tokens_out_avg=source_stats["tokens_out_avg"],
            tokens_in_std=source_stats["tokens_in_std"],
            tokens_out_std=source_stats["tokens_out_std"],
            time_total_ms=source_stats["time_total_ms"],
            time_avg_ms=source_stats["time_avg_ms"],
            time_std_ms=source_stats["time_std_ms"],
            cost_total=source_stats["cost_total"],
            cost_avg=source_stats["cost_avg"],
            cost_std=source_stats["cost_std"],
        )

    def _compute_runs(self, model_dir: Path) -> Runs | None:
        """Compute Runs from a model's run directories.

        Every subdirectory of ``model_dir`` is treated as one run. Runs missing
        ``stats.json`` or ``task.json`` are reported on stdout and skipped.
        The ``Model`` and ``Strategy`` attached to all runs are taken from the
        first complete run in sorted directory order.

        Args:
            model_dir: Directory containing one subdirectory per run.

        Returns:
            A ``Runs`` holding every complete run, or ``None`` when there is
            no complete run.
        """
        run_list: list[Run] = []
        strategy_obj: Strategy | None = None
        model_obj: Model | None = None

        # Sorted so that run order, and the run that seeds model/strategy
        # metadata, do not depend on the arbitrary directory listing order.
        for run_dir in sorted(_subdirs(model_dir)):
            stats_file = run_dir / "stats.json"
            task_file = run_dir / "task.json"

            if not (stats_file.exists() and task_file.exists()):
                print(f"Skipping incomplete run: {run_dir.name}")
                continue

            source_stats: SourceStats = self._load_json(stats_file)
            source_task: SourceTask = self._load_json(task_file)

            # Model and strategy are resolved once and shared by all runs.
            if model_obj is None:
                model_obj = Model(
                    vendor=source_task["model"]["vendor"],
                    name=source_task["model"]["name"],
                )
            if strategy_obj is None:
                strategy_obj = self._load_strategy(
                    run_dir / "strategy.json", source_task
                )

            config = Config(
                seed=source_task["seed"],
                deck=Deck(source_task["deck"]),
                stake=Stake(source_task["stake"]),
            )

            run_list.append(
                Run(
                    id=run_dir.name,
                    model=model_obj,
                    strategy=strategy_obj,
                    config=config,
                    run_won=source_stats["run_won"],
                    run_completed=source_stats["run_completed"],
                    final_ante=source_stats["final_ante"],
                    final_round=source_stats["final_round"],
                    providers=tuple(source_stats["providers"].items()),
                    stats=self._stats_from_source(source_stats),
                )
            )

        if not run_list:
            return None

        # Both are set while building the first run; asserted for type narrowing.
        assert model_obj is not None
        assert strategy_obj is not None

        return Runs(
            generated_at=int(time.time()),
            model=model_obj,
            strategy=strategy_obj,
            runs=tuple(run_list),
        )

    def _compute_leaderboard_entry(self, runs: tuple[Run, ...]) -> LeaderboardEntry:
        """Aggregate Runs into a LeaderboardEntry (base stats only).

        Counts and totals are summed over runs; per-call averages are the
        summed totals divided by the summed number of calls; standard
        deviations are pooled from the per-run averages and deviations.

        Args:
            runs: The runs to aggregate. An empty tuple yields an entry with
                zero counts and zero averages instead of dividing by zero.

        Returns:
            The aggregated ``LeaderboardEntry``.
        """
        n_runs = len(runs)

        # Round statistics (sample standard deviation needs at least 2 runs).
        rounds = [r.final_round for r in runs]
        avg_round = sum(rounds) / n_runs if n_runs else 0.0
        std_round = statistics.stdev(rounds) if n_runs > 1 else 0.0

        # Totals summed across runs.
        calls_total = sum(r.stats.calls_total for r in runs)
        calls_success = sum(r.stats.calls_success for r in runs)
        calls_error = sum(r.stats.calls_error for r in runs)
        calls_failed = sum(r.stats.calls_failed for r in runs)
        tokens_in_total = sum(r.stats.tokens_in_total for r in runs)
        tokens_out_total = sum(r.stats.tokens_out_total for r in runs)
        time_total_ms = sum(r.stats.time_total_ms for r in runs)
        cost_total = sum(r.stats.cost_total for r in runs)

        # Per-call averages, pooled over the calls of all runs.
        if calls_total > 0:
            tokens_in_avg = tokens_in_total / calls_total
            tokens_out_avg = tokens_out_total / calls_total
            time_avg_ms = time_total_ms / calls_total
            cost_avg = cost_total / calls_total
        else:
            tokens_in_avg = tokens_out_avg = time_avg_ms = cost_avg = 0.0

        # Pooled standard deviations, derived from per-run std/avg figures.
        tokens_in_std = self._pooled_std_dev_from_runs(
            runs, "tokens_in_std", "tokens_in_avg", tokens_in_avg, calls_total
        )
        tokens_out_std = self._pooled_std_dev_from_runs(
            runs, "tokens_out_std", "tokens_out_avg", tokens_out_avg, calls_total
        )
        time_std_ms = self._pooled_std_dev_from_runs(
            runs, "time_std_ms", "time_avg_ms", time_avg_ms, calls_total
        )
        cost_std = self._pooled_std_dev_from_runs(
            runs, "cost_std", "cost_avg", cost_avg, calls_total
        )

        aggregated_stats = Stats(
            calls_total=calls_total,
            calls_success=calls_success,
            calls_error=calls_error,
            calls_failed=calls_failed,
            tokens_in_total=tokens_in_total,
            tokens_out_total=tokens_out_total,
            tokens_in_avg=tokens_in_avg,
            tokens_out_avg=tokens_out_avg,
            tokens_in_std=tokens_in_std,
            tokens_out_std=tokens_out_std,
            time_total_ms=time_total_ms,
            time_avg_ms=time_avg_ms,
            time_std_ms=time_std_ms,
            cost_total=cost_total,
            cost_avg=cost_avg,
            cost_std=cost_std,
        )

        return LeaderboardEntry(
            run_count=n_runs,
            run_wins=sum(1 for r in runs if r.run_won),
            run_completed=sum(1 for r in runs if r.run_completed),
            avg_round=avg_round,
            std_round=std_round,
            stats=aggregated_stats,
        )

    def _pooled_std_dev_from_runs(
        self,
        runs: tuple[Run, ...],
        std_attr: str,
        avg_attr: str,
        overall_mean: float,
        total_n: int,
    ) -> float:
        """Compute pooled standard deviation across multiple runs.

        Combines per-run deviations and means into one deviation around
        ``overall_mean``:
        ``sqrt(sum((n_i - 1) * s_i**2 + n_i * (mean_i - overall_mean)**2) / (total_n - 1))``.

        Args:
            runs: Tuple of Run objects
            std_attr: Attribute name for std dev on Run.stats (e.g., "tokens_in_std")
            avg_attr: Attribute name for average on Run.stats (e.g., "tokens_in_avg")
            overall_mean: The overall mean across all runs
            total_n: Total number of observations (calls) across all runs

        Returns:
            The pooled standard deviation, or ``0.0`` when ``total_n <= 1``.
        """
        if total_n <= 1:
            return 0.0

        sum_squares = 0.0
        for run in runs:
            n_i = run.stats.calls_total
            # A run without calls has no observations. Including it would add
            # (0 - 1) * s_i**2, a negative term that can push the variance
            # below zero and turn the square root into a complex number.
            if n_i <= 0:
                continue
            s_i = getattr(run.stats, std_attr)
            mean_i = getattr(run.stats, avg_attr)
            sum_squares += (n_i - 1) * (s_i**2) + n_i * ((mean_i - overall_mean) ** 2)

        pooled_var = sum_squares / (total_n - 1)
        return pooled_var**0.5

    def create_models_leaderboard(
        self, strategy: Strategy, runs_list: list[Runs]
    ) -> ModelsLeaderboard:
        """Create a ModelsLeaderboard from Runs list.

        Compares different models using the same strategy.

        Args:
            strategy: Strategy recorded on the leaderboard.
            runs_list: One ``Runs`` per model to rank.

        Returns:
            A leaderboard whose entries are ordered by average round,
            highest first; ties keep the order of ``runs_list``.
        """
        # Pair each entry with its avg_round so sorting does not depend on the
        # entry type's own interface.
        scored: list[tuple[ModelsLeaderboardEntry, float]] = []
        for runs in runs_list:
            base = self._compute_leaderboard_entry(runs.runs)
            model_entry = ModelsLeaderboardEntry(
                run_count=base.run_count,
                run_wins=base.run_wins,
                run_completed=base.run_completed,
                avg_round=base.avg_round,
                std_round=base.std_round,
                stats=base.stats,
                model=runs.model,
            )
            scored.append((model_entry, base.avg_round))

        scored.sort(key=lambda pair: pair[1], reverse=True)

        return ModelsLeaderboard(
            generated_at=int(time.time()),
            strategy=strategy,
            entries=tuple(entry for entry, _ in scored),
        )

    def create_strategies_leaderboard(
        self, model: Model, runs_list: list[Runs]
    ) -> StrategiesLeaderboard:
        """Create a StrategiesLeaderboard from Runs list.

        Compares different strategies for the same model.

        Args:
            model: Model recorded on the leaderboard.
            runs_list: One ``Runs`` per strategy to rank.

        Returns:
            A leaderboard whose entries are ordered by average round,
            highest first; ties keep the order of ``runs_list``.
        """
        # Pair each entry with its avg_round so sorting does not depend on the
        # entry type's own interface.
        scored: list[tuple[StrategiesLeaderboardEntry, float]] = []
        for runs in runs_list:
            base = self._compute_leaderboard_entry(runs.runs)
            strategy_entry = StrategiesLeaderboardEntry(
                run_count=base.run_count,
                run_wins=base.run_wins,
                run_completed=base.run_completed,
                avg_round=base.avg_round,
                std_round=base.std_round,
                stats=base.stats,
                strategy=runs.strategy,
            )
            scored.append((strategy_entry, base.avg_round))

        scored.sort(key=lambda pair: pair[1], reverse=True)

        return StrategiesLeaderboard(
            generated_at=int(time.time()),
            model=model,
            entries=tuple(entry for entry, _ in scored),
        )
0 commit comments