Skip to content

Commit e93dcf7

Browse files
S1M0N38 and claude committed
docs: add comprehensive documentation to benchmark system
- Add detailed docstrings to AveragedStats, RunsData, and BenchmarkAnalyzer
- Document all class attributes and method parameters
- Improve analysis method documentation with error handling
- Add comprehensive docstrings for leaderboard generation functions

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 48e7cdf commit e93dcf7

1 file changed

Lines changed: 111 additions & 17 deletions

File tree

src/balatrollm/benchmark.py

Lines changed: 111 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,28 @@
1313

1414
@dataclass
1515
class AveragedStats:
16-
"""Simple container for averaged stats."""
16+
"""Simple container for averaged stats.
17+
18+
Stores averaged performance metrics calculated across multiple runs
19+
with the same configuration.
20+
21+
Attributes:
22+
avg_final_round: Average final round reached across runs.
23+
avg_ante_reached: Average ante level reached across runs.
24+
avg_jokers_bought: Average number of jokers bought per run.
25+
avg_jokers_sold: Average number of jokers sold per run.
26+
avg_consumables_used: Average number of consumables used per run.
27+
avg_rerolls: Average number of shop rerolls per run.
28+
avg_money_spent: Average money spent per run.
29+
avg_successful_calls: Average number of successful LLM calls per run.
30+
avg_error_calls: Average number of error calls per run.
31+
avg_failed_calls: Average number of failed calls per run.
32+
avg_total_input_tokens: Average total input tokens per run.
33+
avg_total_output_tokens: Average total output tokens per run.
34+
avg_total_reasoning_tokens: Average total reasoning tokens per run.
35+
avg_total_tokens: Average total tokens per run.
36+
avg_total_response_time_ms: Average total response time per run, in milliseconds.
37+
"""
1738

1839
avg_final_round: float
1940
avg_ante_reached: float
@@ -37,6 +58,20 @@ class AveragedStats:
3758

3859
@dataclass
3960
class RunsData:
61+
"""Aggregated data for multiple runs with identical configuration.
62+
63+
Combines statistics from multiple runs sharing the same model,
64+
strategy, and version for comparative analysis.
65+
66+
Attributes:
67+
config: Shared configuration across all runs.
68+
total_runs: Total number of runs in this group.
69+
completed_runs: Number of runs that completed (won or game over).
70+
won_runs: Number of runs that were won.
71+
averaged_stats: Averaged statistics across all runs.
72+
stats: List of individual run statistics.
73+
"""
74+
4075
config: Config
4176

4277
# Run statistics
@@ -52,14 +87,34 @@ class RunsData:
5287

5388

5489
class BenchmarkAnalyzer:
55-
"""Analyzes BalatroLLM runs and generates leaderboards."""
90+
"""Analyzes BalatroLLM runs and generates leaderboards.
91+
92+
Processes structured run data to generate comprehensive performance
93+
analysis and rankings organized by version and strategy.
94+
95+
Attributes:
96+
runs_dir: Directory containing run data to analyze.
97+
aggregated_data: Dictionary mapping config keys to aggregated run data.
98+
"""
5699

57100
def __init__(self, runs_dir: Path = Path("runs")):
101+
"""Initialize the benchmark analyzer.
102+
103+
Args:
104+
runs_dir: Directory containing run data to analyze (default: 'runs').
105+
"""
58106
self.runs_dir = runs_dir
59107
self.aggregated_data: dict[str, RunsData] = {}
60108

61109
def analyze_all_runs(self) -> None:
62-
"""Analyze all runs in the runs directory."""
110+
"""Analyze all runs in the runs directory.
111+
112+
Processes all run data in the directory structure and aggregates
113+
statistics by configuration.
114+
115+
Raises:
116+
FileNotFoundError: If the runs directory doesn't exist.
117+
"""
63118
print("Analyzing all runs...")
64119

65120
if not self.runs_dir.exists():
@@ -81,29 +136,29 @@ def analyze_all_runs(self) -> None:
81136
def _analyze_runs(
82137
self, version_dir: Path, version: str
83138
) -> list[tuple[RunStats, Config]]:
84-
"""Analyze runs using the directory structure: version/strategy/provider/model/run"""
139+
"""Analyze runs using the directory structure: version/strategy/provider/model/run.
140+
141+
Args:
142+
version_dir: Directory containing runs for a specific version.
143+
version: Version string for this directory.
144+
145+
Returns:
146+
List of tuples containing (RunStats, Config) for each valid run.
147+
"""
85148
run_stats = []
86149

87150
for strategy_dir in version_dir.iterdir():
88151
if not strategy_dir.is_dir():
89152
continue
90153

91-
strategy = strategy_dir.name
92-
93154
for provider_dir in strategy_dir.iterdir():
94155
if not provider_dir.is_dir():
95156
continue
96157

97-
provider = provider_dir.name
98-
99158
for model_dir in provider_dir.iterdir():
100159
if not model_dir.is_dir():
101160
continue
102161

103-
model_name = model_dir.name
104-
# Combine provider and model for full model identifier
105-
full_model = f"{provider}/{model_name}"
106-
107162
for run_dir in model_dir.iterdir():
108163
if not run_dir.is_dir():
109164
continue
@@ -115,7 +170,14 @@ def _analyze_runs(
115170
return run_stats
116171

117172
def _load_run_data(self, run_dir: Path) -> tuple[RunStats, Config] | None:
118-
"""Load data from a single run directory."""
173+
"""Load data from a single run directory.
174+
175+
Args:
176+
run_dir: Directory containing a single run's data files.
177+
178+
Returns:
179+
Tuple of (RunStats, Config) if loading succeeds, None if it fails.
180+
"""
119181
config_file = run_dir / "config.json"
120182
stats_file = run_dir / "stats.json"
121183

@@ -138,7 +200,14 @@ def _load_run_data(self, run_dir: Path) -> tuple[RunStats, Config] | None:
138200
return None
139201

140202
def _aggregate_data(self, all_run_stats: list[tuple[RunStats, Config]]) -> None:
141-
"""Aggregate data by identical config."""
203+
"""Aggregate data by identical config.
204+
205+
Groups runs by identical configuration and calculates averaged statistics
206+
for each unique configuration.
207+
208+
Args:
209+
all_run_stats: List of (RunStats, Config) tuples from all runs.
210+
"""
142211
print("Aggregating data...")
143212

144213
# Group runs by identical config content (excluding run-specific fields)
@@ -181,7 +250,17 @@ def _aggregate_data(self, all_run_stats: list[tuple[RunStats, Config]]) -> None:
181250
print(f"Aggregated {len(self.aggregated_data)} unique configurations")
182251

183252
def _calculate_averaged_stats(self, stats_list: list[RunStats]) -> AveragedStats:
184-
"""Calculate averaged stats using the AveragedStats dataclass."""
253+
"""Calculate averaged stats using the AveragedStats dataclass.
254+
255+
Args:
256+
stats_list: List of RunStats to average.
257+
258+
Returns:
259+
AveragedStats containing averaged values across all runs.
260+
261+
Raises:
262+
ValueError: If stats_list is empty.
263+
"""
185264
if not stats_list:
186265
raise ValueError("Cannot calculate averages for empty stats list")
187266

@@ -222,7 +301,14 @@ def avg_len(field_name: str) -> float:
222301
)
223302

224303
def generate_leaderboard(self, output_dir: Path = Path("benchmarks")) -> None:
225-
"""Generate leaderboard and detailed analysis files organized by version/strategy."""
304+
"""Generate leaderboard and detailed analysis files organized by version/strategy.
305+
306+
Creates hierarchical benchmark results with strategy-specific leaderboards
307+
and individual model performance files.
308+
309+
Args:
310+
output_dir: Directory to write benchmark results (default: 'benchmarks').
311+
"""
226312
print("Generating leaderboard...")
227313

228314
# Check if output directory exists and ask for confirmation
@@ -371,7 +457,15 @@ def generate_leaderboard(self, output_dir: Path = Path("benchmarks")) -> None:
371457
def run_benchmark_analysis(
372458
runs_dir: Path = Path("runs"), output_dir: Path = Path("benchmarks")
373459
) -> None:
374-
"""Analyze BalatroLLM runs and generate comprehensive leaderboards."""
460+
"""Analyze BalatroLLM runs and generate comprehensive leaderboards.
461+
462+
Main entry point for benchmark analysis that creates analyzer and
463+
processes all run data.
464+
465+
Args:
466+
runs_dir: Directory containing run data to analyze (default: 'runs').
467+
output_dir: Directory to write benchmark results (default: 'benchmarks').
468+
"""
375469
print("BalatroLLM Benchmark Analyzer")
376470
print(f"Analyzing runs in: {runs_dir}")
377471
print(f"Output directory: {output_dir}")

0 commit comments

Comments
 (0)