1010
1111from tqdm import tqdm
1212
13- from balatrollm .config import Config
13+ from balatrollm .config import Config , StrategyManifest
1414from balatrollm .data_collection import Stats
1515
1616
@@ -47,6 +47,7 @@ class ModelStats:
4747 average : ModelAggregatedStats # avg over tool calls
4848 std_dev : ModelAggregatedStats # std dev over tool calls
4949 config : Config
50+ strategy : StrategyManifest | None
5051
5152
5253@dataclass
@@ -77,6 +78,100 @@ def analyze_all_runs(self) -> None:
7778 if strategy_dir .is_dir ():
7879 self .analyze_strategy_runs (strategy_dir )
7980
def analyze_version_by_models(self, version_dir: Path) -> None:
    """Compare models within each strategy of a single version.

    Every immediate subdirectory of ``version_dir`` is treated as a strategy
    directory and handed to ``self.analyze_strategy_runs``; plain files are
    ignored.

    Args:
        version_dir: Path to version directory (e.g., runs/v0.13.2)

    Raises:
        FileNotFoundError: If ``version_dir`` is not an existing directory.
    """
    if version_dir.is_dir():
        # Lazy filter: each entry is checked right before it is analyzed.
        strategy_dirs = (
            entry for entry in version_dir.iterdir() if entry.is_dir()
        )
        for strategy_dir in strategy_dirs:
            self.analyze_strategy_runs(strategy_dir)
        return

    raise FileNotFoundError(f"Version directory not found: {version_dir}")
93+
def analyze_version_by_strategies(self, version_dir: Path) -> None:
    """Compare strategies for each model of a single version.

    Walks ``version_dir/<strategy>/<vendor>/<model>``, groups the model
    directories by (vendor, model) and then calls
    ``self._analyze_model_strategies`` once per model with all the
    ``(model_dir, strategy_name)`` pairs found for it. Non-directory entries
    are ignored at every level.

    Args:
        version_dir: Path to version directory (e.g., runs/v0.13.2)

    Raises:
        FileNotFoundError: If ``version_dir`` is not an existing directory.
    """
    if not version_dir.is_dir():
        raise FileNotFoundError(f"Version directory not found: {version_dir}")

    # (vendor, model) -> [(model_dir, strategy_name), ...]
    grouped: dict[tuple[str, str], list[tuple[Path, str]]] = {}

    for strategy_dir in version_dir.iterdir():
        if strategy_dir.is_dir():
            for vendor_dir in strategy_dir.iterdir():
                if vendor_dir.is_dir():
                    for model_dir in vendor_dir.iterdir():
                        if model_dir.is_dir():
                            identity = (vendor_dir.name, model_dir.name)
                            grouped.setdefault(identity, []).append(
                                (model_dir, strategy_dir.name)
                            )

    # One strategy comparison per model, in first-seen order.
    for (vendor_name, model_name), dirs_and_strategies in grouped.items():
        self._analyze_model_strategies(
            dirs_and_strategies, vendor_name, model_name
        )
133+
def _analyze_model_strategies(
    self,
    model_dirs_with_strategies: list[tuple[Path, str]],
    vendor: str,
    model: str,
) -> None:
    """Write per-strategy stats and a strategy leaderboard for one model.

    For each ``(model_dir, strategy_name)`` pair the stats returned by
    ``self.compute_model_stats`` are dumped (without the config seed) to
    ``<benchmark_dir>/<vendor>/<model>/<strategy_name>/stats.json`` and
    ``self.create_detailed_run_dirs`` is invoked for that output directory.
    Afterwards ``self.compute_models_leaderboard`` is fed the collected stats
    and its result is written to
    ``<benchmark_dir>/<vendor>/<model>/leaderboard.json``.

    Args:
        model_dirs_with_strategies: List of (model_dir, strategy_name) tuples
        vendor: Vendor name (e.g., openrouter)
        model: Model name (e.g., openai/gpt-oss-20b)
    """
    model_root = self.benchmark_dir / vendor / model
    per_strategy_stats = []

    for run_source, strategy_label in model_dirs_with_strategies:
        computed = self.compute_model_stats(run_source)
        per_strategy_stats.append(computed)

        strategy_out = model_root / strategy_label
        strategy_out.mkdir(parents=True, exist_ok=True)

        # The seed is dropped from the serialized config before writing.
        payload = asdict(computed)
        payload["config"].pop("seed")
        with open(strategy_out / "stats.json", "w") as fh:
            json.dump(payload, fh, indent=2)

        # Per-run detail directories live next to the strategy's stats.json.
        self.create_detailed_run_dirs(run_source, strategy_out)

    # Leaderboard ranks the strategies against each other for this model.
    model_root.mkdir(parents=True, exist_ok=True)
    board = self.compute_models_leaderboard(per_strategy_stats)
    with open(model_root / "leaderboard.json", "w") as fh:
        json.dump(asdict(board), fh, indent=2)
174+
80175 def analyze_strategy_runs (self , strategy_dir : Path ) -> None :
81176 models_stats = []
82177 output_dir = self .benchmark_dir / strategy_dir .relative_to (self .runs_dir )
@@ -111,6 +206,7 @@ def analyze_strategy_runs(self, strategy_dir: Path) -> None:
111206 def compute_model_stats (self , model_dir : Path ) -> ModelStatsFull :
112207 stats : list [Stats ] = []
113208 configs : list [Config ] = []
209+ strategies : list [StrategyManifest ] = []
114210 run_names : list [str ] = []
115211 for run_dir in model_dir .iterdir ():
116212 if not run_dir .is_dir ():
@@ -119,6 +215,7 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
119215 # Skip runs that don't have required files
120216 stats_file = run_dir / "stats.json"
121217 config_file = run_dir / "config.json"
218+ strategy_file = run_dir / "strategy.json"
122219 if not stats_file .exists () or not config_file .exists ():
123220 print (f"Skipping incomplete run: { run_dir .name } " )
124221 continue
@@ -127,9 +224,29 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
127224 stats .append (Stats .from_dict (json .load (f )))
128225 with open (config_file , "r" ) as f :
129226 configs .append (Config (** json .load (f )))
227+
228+ # Load strategy manifest if available
229+ if strategy_file .exists ():
230+ with open (strategy_file , "r" ) as f :
231+ strategy_data = json .load (f )
232+ strategies .append (StrategyManifest (** strategy_data ))
233+ else :
234+ # Fallback: try to load from strategies directory
235+ try :
236+ strategies .append (
237+ StrategyManifest .from_manifest_file (configs [- 1 ].strategy )
238+ )
239+ except FileNotFoundError :
240+ print (
241+ f"Warning: Could not find strategy manifest for { configs [- 1 ].strategy } "
242+ )
243+ continue
244+
130245 run_names .append (run_dir .name )
131246
132247 config = configs [0 ]
248+ strategy = strategies [0 ] if strategies else None
249+
133250 for c in configs [1 :]:
134251 # Compare all fields except seed (seeds can differ for multi-seed runs)
135252 assert (
@@ -140,6 +257,17 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
140257 and c .challenge == config .challenge
141258 ), f"Configs differ (excluding seed): { c } vs { config } "
142259
260+ # Validate that all strategy manifests are identical
261+ if strategy is not None :
262+ for s in strategies [1 :]:
263+ assert (
264+ s .name == strategy .name
265+ and s .description == strategy .description
266+ and s .author == strategy .author
267+ and s .version == strategy .version
268+ and s .tags == strategy .tags
269+ ), f"Strategy manifests differ: { s } vs { strategy } "
270+
143271 avg_final_round = sum (stat .final_round for stat in stats ) / len (stats )
144272 std_final_round = statistics .stdev (stat .final_round for stat in stats )
145273
@@ -216,6 +344,7 @@ def tot_std_dev(attr: str) -> float:
216344 std_dev = std_dev ,
217345 stats = stats ,
218346 config = config ,
347+ strategy = strategy ,
219348 )
220349
221350 def compute_models_leaderboard (
@@ -401,3 +530,46 @@ def create_detailed_run_dirs(self, model_dir: Path, output_dir: Path) -> None:
401530 if png_file .exists ():
402531 screenshot_dest = custom_id_dir / "screenshot.png"
403532 screenshot_dest .write_bytes (png_file .read_bytes ())
533+
def generate_manifest(self, base_dir: Path, current_version: str) -> None:
    """Generate manifest.json tracking available benchmark versions.

    Scans ``base_dir`` for subdirectories named ``v<major>.<minor>.<patch>``,
    orders them by numeric version (newest first) and writes
    ``base_dir/manifest.json`` as ``{"versions": [{"version": ...}, ...]}``.
    The entry whose name equals the current version additionally carries
    ``"latest": true``; if no directory matches the current version, no entry
    is flagged. ``base_dir`` is created when it does not exist yet, in which
    case an empty version list is written.

    Args:
        base_dir: Base directory containing version subdirectories
            (e.g., benchmarks/models or benchmarks/strategies)
        current_version: Current version string from __version__. A leading
            "v" is tolerated, so "0.13.2" and "v0.13.2" are equivalent.
    """
    if not base_dir.exists():
        base_dir.mkdir(parents=True, exist_ok=True)

    version_pattern = re.compile(r"^v(\d+)\.(\d+)\.(\d+)$")

    # (numeric version tuple, directory name) for every version directory.
    found: list[tuple[tuple[int, ...], str]] = []
    for item in base_dir.iterdir():
        if not item.is_dir():
            continue
        match = version_pattern.match(item.name)
        if match:
            found.append((tuple(int(part) for part in match.groups()), item.name))

    # Newest first. Comparing integers keeps v0.10.0 above v0.9.0; the name is
    # the tie-breaker so the order does not depend on directory listing order.
    found.sort(reverse=True)

    # Normalise the prefix so a caller passing "v0.13.2" does not end up
    # comparing against "vv0.13.2" and silently losing the latest marker.
    latest_name = f"v{current_version.removeprefix('v')}"

    manifest_entries = []
    for _, version_str in found:
        entry: dict[str, str | bool] = {"version": version_str}
        if version_str == latest_name:
            entry["latest"] = True
        manifest_entries.append(entry)

    manifest = {"versions": manifest_entries}
    with open(base_dir / "manifest.json", "w") as f:
        json.dump(manifest, f, indent=2)
0 commit comments