feat: add strategy metadata to benchmark

S1M0N38 · S1M0N38 · commit 88ebc44b4aad · 2025-10-25T01:03:58.000+02:00
diff --git a/src/balatrollm/__init__.py b/src/balatrollm/__init__.py
@@ -2,6 +2,6 @@
 
 __version__ = "0.13.2"
 
-from .cli import main
+from .balatrollm_cli import main
 
 __all__ = ["main"]
diff --git a/src/balatrollm/benchmark.py b/src/balatrollm/benchmark.py
@@ -10,7 +10,7 @@
 
 from tqdm import tqdm
 
-from balatrollm.config import Config
+from balatrollm.config import Config, StrategyManifest
 from balatrollm.data_collection import Stats
 
 
@@ -47,6 +47,7 @@ class ModelStats:
     average: ModelAggregatedStats  # avg over tool calls
     std_dev: ModelAggregatedStats  # std dev over tool calls
     config: Config
+    strategy: StrategyManifest | None
 
 
 @dataclass
@@ -77,6 +78,100 @@ def analyze_all_runs(self) -> None:
                         if strategy_dir.is_dir():
                             self.analyze_strategy_runs(strategy_dir)
 
+    def analyze_version_by_models(self, version_dir: Path) -> None:
+        """Analyze a single version by comparing models within each strategy.
+
+        Args:
+            version_dir: Path to version directory (e.g., runs/v0.13.2)
+        """
+        if not version_dir.is_dir():
+            raise FileNotFoundError(f"Version directory not found: {version_dir}")
+
+        for strategy_dir in version_dir.iterdir():
+            if strategy_dir.is_dir():
+                self.analyze_strategy_runs(strategy_dir)
+
+    def analyze_version_by_strategies(self, version_dir: Path) -> None:
+        """Analyze a single version by comparing strategies for each model.
+
+        Args:
+            version_dir: Path to version directory (e.g., runs/v0.13.2)
+        """
+        if not version_dir.is_dir():
+            raise FileNotFoundError(f"Version directory not found: {version_dir}")
+
+        # Collect all model_dirs with their strategy associations
+        models_by_key: dict[
+            str, list[tuple[Path, str]]
+        ] = {}  # key -> [(model_dir, strategy)]
+
+        for strategy_dir in version_dir.iterdir():
+            if not strategy_dir.is_dir():
+                continue
+            strategy_name = strategy_dir.name
+
+            for vendor_dir in strategy_dir.iterdir():
+                if not vendor_dir.is_dir():
+                    continue
+
+                for model_dir in vendor_dir.iterdir():
+                    if not model_dir.is_dir():
+                        continue
+
+                    # Create a unique key for this model (vendor/model)
+                    model_key = f"{vendor_dir.name}/{model_dir.name}"
+                    if model_key not in models_by_key:
+                        models_by_key[model_key] = []
+                    models_by_key[model_key].append((model_dir, strategy_name))
+
+        # For each model, analyze strategies
+        for model_key, model_dirs_with_strategies in models_by_key.items():
+            vendor_name, model_name = model_key.split("/", 1)
+            self._analyze_model_strategies(
+                model_dirs_with_strategies, vendor_name, model_name
+            )
+
+    def _analyze_model_strategies(
+        self,
+        model_dirs_with_strategies: list[tuple[Path, str]],
+        vendor: str,
+        model: str,
+    ) -> None:
+        """Analyze strategies for a specific model.
+
+        Args:
+            model_dirs_with_strategies: List of (model_dir, strategy_name) tuples
+            vendor: Vendor name (e.g., openrouter)
+            model: Model name (e.g., openai/gpt-oss-20b)
+        """
+        models_stats = []
+
+        for model_dir, strategy_name in model_dirs_with_strategies:
+            model_stats = self.compute_model_stats(model_dir)
+            models_stats.append((model_stats, strategy_name))
+
+            # Save individual model stats
+            output_dir = self.benchmark_dir / vendor / model / strategy_name
+            model_stats_path = output_dir / "stats.json"
+            model_stats_path.parent.mkdir(exist_ok=True, parents=True)
+            stats_dict = asdict(model_stats)
+            stats_dict["config"].pop("seed")
+            with open(model_stats_path, "w") as f:
+                json.dump(stats_dict, f, indent=2)
+
+            # Create detailed run directories
+            self.create_detailed_run_dirs(model_dir, output_dir)
+
+        # Create leaderboard comparing strategies
+        output_dir = self.benchmark_dir / vendor / model
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        strategy_stats = [stats for stats, _ in models_stats]
+        leaderboard = self.compute_models_leaderboard(strategy_stats)
+        leaderboard_path = output_dir / "leaderboard.json"
+        with open(leaderboard_path, "w") as f:
+            json.dump(asdict(leaderboard), f, indent=2)
+
     def analyze_strategy_runs(self, strategy_dir: Path) -> None:
         models_stats = []
         output_dir = self.benchmark_dir / strategy_dir.relative_to(self.runs_dir)
@@ -111,6 +206,7 @@ def analyze_strategy_runs(self, strategy_dir: Path) -> None:
     def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
         stats: list[Stats] = []
         configs: list[Config] = []
+        strategies: list[StrategyManifest] = []
         run_names: list[str] = []
         for run_dir in model_dir.iterdir():
             if not run_dir.is_dir():
@@ -119,6 +215,7 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
             # Skip runs that don't have required files
             stats_file = run_dir / "stats.json"
             config_file = run_dir / "config.json"
+            strategy_file = run_dir / "strategy.json"
             if not stats_file.exists() or not config_file.exists():
                 print(f"Skipping incomplete run: {run_dir.name}")
                 continue
@@ -127,9 +224,29 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
                 stats.append(Stats.from_dict(json.load(f)))
             with open(config_file, "r") as f:
                 configs.append(Config(**json.load(f)))
+
+            # Load strategy manifest if available
+            if strategy_file.exists():
+                with open(strategy_file, "r") as f:
+                    strategy_data = json.load(f)
+                    strategies.append(StrategyManifest(**strategy_data))
+            else:
+                # Fallback: try to load from strategies directory
+                try:
+                    strategies.append(
+                        StrategyManifest.from_manifest_file(configs[-1].strategy)
+                    )
+                except FileNotFoundError:
+                    print(
+                        f"Warning: Could not find strategy manifest for {configs[-1].strategy}"
+                    )
+                    continue
+
             run_names.append(run_dir.name)
 
         config = configs[0]
+        strategy = strategies[0] if strategies else None
+
         for c in configs[1:]:
             # Compare all fields except seed (seeds can differ for multi-seed runs)
             assert (
@@ -140,6 +257,17 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
                 and c.challenge == config.challenge
             ), f"Configs differ (excluding seed): {c} vs {config}"
 
+        # Validate that all strategy manifests are identical
+        if strategy is not None:
+            for s in strategies[1:]:
+                assert (
+                    s.name == strategy.name
+                    and s.description == strategy.description
+                    and s.author == strategy.author
+                    and s.version == strategy.version
+                    and s.tags == strategy.tags
+                ), f"Strategy manifests differ: {s} vs {strategy}"
+
         avg_final_round = sum(stat.final_round for stat in stats) / len(stats)
         std_final_round = statistics.stdev(stat.final_round for stat in stats)
 
@@ -216,6 +344,7 @@ def tot_std_dev(attr: str) -> float:
             std_dev=std_dev,
             stats=stats,
             config=config,
+            strategy=strategy,
         )
 
     def compute_models_leaderboard(
@@ -401,3 +530,46 @@ def create_detailed_run_dirs(self, model_dir: Path, output_dir: Path) -> None:
                     if png_file.exists():
                         screenshot_dest = custom_id_dir / "screenshot.png"
                         screenshot_dest.write_bytes(png_file.read_bytes())
+
+    def generate_manifest(self, base_dir: Path, current_version: str) -> None:
+        """Generate manifest.json tracking available benchmark versions.
+
+        Scans base_dir for all version directories (v*.*.* pattern),
+        sorts them semantically in descending order, and marks the
+        current version as latest.
+
+        Args:
+            base_dir: Base directory containing version subdirectories
+                     (e.g., benchmarks/models or benchmarks/strategies)
+            current_version: Current version string from __version__
+        """
+        if not base_dir.exists():
+            base_dir.mkdir(parents=True, exist_ok=True)
+
+        # Find all version directories
+        versions = []
+        version_pattern = re.compile(r"^v(\d+)\.(\d+)\.(\d+)$")
+
+        for item in base_dir.iterdir():
+            if item.is_dir():
+                match = version_pattern.match(item.name)
+                if match:
+                    major, minor, patch = map(int, match.groups())
+                    versions.append((item.name, (major, minor, patch)))
+
+        # Sort versions in descending order (newest first)
+        versions.sort(key=lambda x: x[1], reverse=True)
+
+        # Create manifest entries
+        manifest_entries = []
+        for version_str, _ in versions:
+            entry = {"version": version_str}
+            if version_str == f"v{current_version}":
+                entry["latest"] = True
+            manifest_entries.append(entry)
+
+        # Write manifest.json
+        manifest = {"versions": manifest_entries}
+        manifest_path = base_dir / "manifest.json"
+        with open(manifest_path, "w") as f:
+            json.dump(manifest, f, indent=2)