1313
1414@dataclass
1515class AveragedStats :
16- """Simple container for averaged stats."""
16+ """Simple container for averaged stats.
17+
18+ Stores averaged performance metrics calculated across multiple runs
19+ with the same configuration.
20+
21+ Attributes:
22+ avg_final_round: Average final round reached across runs.
23+ avg_ante_reached: Average ante level reached across runs.
24+ avg_jokers_bought: Average number of jokers bought per run.
25+ avg_jokers_sold: Average number of jokers sold per run.
26+ avg_consumables_used: Average number of consumables used per run.
27+ avg_rerolls: Average number of shop rerolls per run.
28+ avg_money_spent: Average money spent per run.
29+ avg_successful_calls: Average number of successful LLM calls per run.
30+ avg_error_calls: Average number of error calls per run.
31+ avg_failed_calls: Average number of failed calls per run.
32+ avg_total_input_tokens: Average total input tokens per run.
33+ avg_total_output_tokens: Average total output tokens per run.
34+ avg_total_reasoning_tokens: Average total reasoning tokens per run.
35+ avg_total_tokens: Average total tokens per run.
36+ avg_total_response_time_ms: Average total response time per run, in milliseconds.
37+ """
1738
1839 avg_final_round : float
1940 avg_ante_reached : float
@@ -37,6 +58,20 @@ class AveragedStats:
3758
3859@dataclass
3960class RunsData :
61+ """Aggregated data for multiple runs with identical configuration.
62+
63+ Combines statistics from multiple runs sharing the same model,
64+ strategy, and version for comparative analysis.
65+
66+ Attributes:
67+ config: Shared configuration across all runs.
68+ total_runs: Total number of runs in this group.
69+ completed_runs: Number of runs that completed (won or game over).
70+ won_runs: Number of runs that were won.
71+ averaged_stats: Averaged statistics across all runs.
72+ stats: List of individual run statistics.
73+ """
74+
4075 config : Config
4176
4277 # Run statistics
@@ -52,14 +87,34 @@ class RunsData:
5287
5388
5489class BenchmarkAnalyzer :
55- """Analyzes BalatroLLM runs and generates leaderboards."""
90+ """Analyzes BalatroLLM runs and generates leaderboards.
91+
92+ Processes structured run data to generate comprehensive performance
93+ analysis and rankings organized by version and strategy.
94+
95+ Attributes:
96+ runs_dir: Directory containing run data to analyze.
97+ aggregated_data: Dictionary mapping config keys to aggregated run data.
98+ """
5699
57100 def __init__ (self , runs_dir : Path = Path ("runs" )):
101+ """Initialize the benchmark analyzer.
102+
103+ Args:
104+ runs_dir: Directory containing run data to analyze (default: 'runs').
105+ """
58106 self .runs_dir = runs_dir
59107 self .aggregated_data : dict [str , RunsData ] = {}
60108
61109 def analyze_all_runs (self ) -> None :
62- """Analyze all runs in the runs directory."""
110+ """Analyze all runs in the runs directory.
111+
112+ Processes all run data in the directory structure and aggregates
113+ statistics by configuration.
114+
115+ Raises:
116+ FileNotFoundError: If the runs directory doesn't exist.
117+ """
63118 print ("Analyzing all runs..." )
64119
65120 if not self .runs_dir .exists ():
@@ -81,29 +136,29 @@ def analyze_all_runs(self) -> None:
81136 def _analyze_runs (
82137 self , version_dir : Path , version : str
83138 ) -> list [tuple [RunStats , Config ]]:
84- """Analyze runs using the directory structure: version/strategy/provider/model/run"""
139+ """Analyze runs using the directory structure: version/strategy/provider/model/run.
140+
141+ Args:
142+ version_dir: Directory containing runs for a specific version.
143+ version: Version string for this directory.
144+
145+ Returns:
146+ List of tuples containing (RunStats, Config) for each valid run.
147+ """
85148 run_stats = []
86149
87150 for strategy_dir in version_dir .iterdir ():
88151 if not strategy_dir .is_dir ():
89152 continue
90153
91- strategy = strategy_dir .name
92-
93154 for provider_dir in strategy_dir .iterdir ():
94155 if not provider_dir .is_dir ():
95156 continue
96157
97- provider = provider_dir .name
98-
99158 for model_dir in provider_dir .iterdir ():
100159 if not model_dir .is_dir ():
101160 continue
102161
103- model_name = model_dir .name
104- # Combine provider and model for full model identifier
105- full_model = f"{ provider } /{ model_name } "
106-
107162 for run_dir in model_dir .iterdir ():
108163 if not run_dir .is_dir ():
109164 continue
@@ -115,7 +170,14 @@ def _analyze_runs(
115170 return run_stats
116171
117172 def _load_run_data (self , run_dir : Path ) -> tuple [RunStats , Config ] | None :
118- """Load data from a single run directory."""
173+ """Load data from a single run directory.
174+
175+ Args:
176+ run_dir: Directory containing a single run's data files.
177+
178+ Returns:
179+ Tuple of (RunStats, Config) if loading succeeds, None if it fails.
180+ """
119181 config_file = run_dir / "config.json"
120182 stats_file = run_dir / "stats.json"
121183
@@ -138,7 +200,14 @@ def _load_run_data(self, run_dir: Path) -> tuple[RunStats, Config] | None:
138200 return None
139201
140202 def _aggregate_data (self , all_run_stats : list [tuple [RunStats , Config ]]) -> None :
141- """Aggregate data by identical config."""
203+ """Aggregate data by identical config.
204+
205+ Groups runs by identical configuration and calculates averaged statistics
206+ for each unique configuration.
207+
208+ Args:
209+ all_run_stats: List of (RunStats, Config) tuples from all runs.
210+ """
142211 print ("Aggregating data..." )
143212
144213 # Group runs by identical config content (excluding run-specific fields)
@@ -181,7 +250,17 @@ def _aggregate_data(self, all_run_stats: list[tuple[RunStats, Config]]) -> None:
181250 print (f"Aggregated { len (self .aggregated_data )} unique configurations" )
182251
183252 def _calculate_averaged_stats (self , stats_list : list [RunStats ]) -> AveragedStats :
184- """Calculate averaged stats using the AveragedStats dataclass."""
253+ """Calculate averaged stats using the AveragedStats dataclass.
254+
255+ Args:
256+ stats_list: List of RunStats to average.
257+
258+ Returns:
259+ AveragedStats containing averaged values across all runs.
260+
261+ Raises:
262+ ValueError: If stats_list is empty.
263+ """
185264 if not stats_list :
186265 raise ValueError ("Cannot calculate averages for empty stats list" )
187266
@@ -222,7 +301,14 @@ def avg_len(field_name: str) -> float:
222301 )
223302
224303 def generate_leaderboard (self , output_dir : Path = Path ("benchmarks" )) -> None :
225- """Generate leaderboard and detailed analysis files organized by version/strategy."""
304+ """Generate leaderboard and detailed analysis files organized by version/strategy.
305+
306+ Creates hierarchical benchmark results with strategy-specific leaderboards
307+ and individual model performance files.
308+
309+ Args:
310+ output_dir: Directory to write benchmark results (default: 'benchmarks').
311+ """
226312 print ("Generating leaderboard..." )
227313
228314 # Check if output directory exists and ask for confirmation
@@ -371,7 +457,15 @@ def generate_leaderboard(self, output_dir: Path = Path("benchmarks")) -> None:
371457def run_benchmark_analysis (
372458 runs_dir : Path = Path ("runs" ), output_dir : Path = Path ("benchmarks" )
373459) -> None :
374- """Analyze BalatroLLM runs and generate comprehensive leaderboards."""
460+ """Analyze BalatroLLM runs and generate comprehensive leaderboards.
461+
462+ Main entry point for benchmark analysis that creates an analyzer and
463+ processes all run data.
464+
465+ Args:
466+ runs_dir: Directory containing run data to analyze (default: 'runs').
467+ output_dir: Directory to write benchmark results (default: 'benchmarks').
468+ """
375469 print ("BalatroLLM Benchmark Analyzer" )
376470 print (f"Analyzing runs in: { runs_dir } " )
377471 print (f"Output directory: { output_dir } " )
0 commit comments