Skip to content

Commit 88ebc44

Browse files
committed
feat: add strategy metadata to benchmark
1 parent b9a1284 commit 88ebc44

2 files changed

Lines changed: 174 additions & 2 deletions

File tree

src/balatrollm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@
22

33
__version__ = "0.13.2"
44

5-
from .cli import main
5+
from .balatrollm_cli import main
66

77
__all__ = ["main"]

src/balatrollm/benchmark.py

Lines changed: 173 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from tqdm import tqdm
1212

13-
from balatrollm.config import Config
13+
from balatrollm.config import Config, StrategyManifest
1414
from balatrollm.data_collection import Stats
1515

1616

@@ -47,6 +47,7 @@ class ModelStats:
4747
average: ModelAggregatedStats # avg over tool calls
4848
std_dev: ModelAggregatedStats # std dev over tool calls
4949
config: Config
50+
strategy: StrategyManifest | None
5051

5152

5253
@dataclass
@@ -77,6 +78,100 @@ def analyze_all_runs(self) -> None:
7778
if strategy_dir.is_dir():
7879
self.analyze_strategy_runs(strategy_dir)
7980

81+
def analyze_version_by_models(self, version_dir: Path) -> None:
    """Run the per-strategy model comparison for one version directory.

    Each sub-directory of ``version_dir`` is treated as a strategy
    directory and passed to ``analyze_strategy_runs``; entries that are
    not directories are ignored.

    Args:
        version_dir: Path to version directory (e.g., runs/v0.13.2)

    Raises:
        FileNotFoundError: If ``version_dir`` is not an existing directory.
    """
    if version_dir.is_dir():
        # Lazily filter the listing so each entry is checked right before use.
        strategy_dirs = (entry for entry in version_dir.iterdir() if entry.is_dir())
        for strategy_dir in strategy_dirs:
            self.analyze_strategy_runs(strategy_dir)
        return

    raise FileNotFoundError(f"Version directory not found: {version_dir}")
93+
94+
def analyze_version_by_strategies(self, version_dir: Path) -> None:
    """Analyze a single version by comparing strategies for each model.

    Walks ``version_dir/<strategy>/<vendor>/<model>`` and groups the model
    directories by ``(vendor, model)``. Each group is then passed to
    ``_analyze_model_strategies`` together with the vendor and model names.

    Directories are visited in sorted order at every level, so the grouping
    and the order of strategies inside each group are reproducible
    regardless of the order in which the filesystem lists entries.

    Args:
        version_dir: Path to version directory (e.g., runs/v0.13.2)

    Raises:
        FileNotFoundError: If ``version_dir`` is not an existing directory.
    """
    if not version_dir.is_dir():
        raise FileNotFoundError(f"Version directory not found: {version_dir}")

    def sorted_subdirs(parent: Path) -> list[Path]:
        # Path.iterdir() yields entries in arbitrary order; sort for determinism.
        return sorted(entry for entry in parent.iterdir() if entry.is_dir())

    # (vendor, model) -> [(model_dir, strategy_name), ...]
    runs_by_model: dict[tuple[str, str], list[tuple[Path, str]]] = {}

    for strategy_dir in sorted_subdirs(version_dir):
        for vendor_dir in sorted_subdirs(strategy_dir):
            for model_dir in sorted_subdirs(vendor_dir):
                model_key = (vendor_dir.name, model_dir.name)
                runs_by_model.setdefault(model_key, []).append(
                    (model_dir, strategy_dir.name)
                )

    # For each model, compare the strategies it was run with.
    for (vendor_name, model_name), model_dirs_with_strategies in runs_by_model.items():
        self._analyze_model_strategies(
            model_dirs_with_strategies, vendor_name, model_name
        )
133+
134+
def _analyze_model_strategies(
    self,
    model_dirs_with_strategies: list[tuple[Path, str]],
    vendor: str,
    model: str,
) -> None:
    """Write per-strategy stats and a strategy leaderboard for one model.

    For every ``(model_dir, strategy_name)`` pair the aggregated stats are
    computed with ``compute_model_stats`` and written (without the config's
    ``seed`` entry) to ``<benchmark_dir>/<vendor>/<model>/<strategy>/stats.json``.
    The detailed run directories are then created alongside. Finally a
    leaderboard over all collected stats is written to
    ``<benchmark_dir>/<vendor>/<model>/leaderboard.json``.

    Args:
        model_dirs_with_strategies: List of (model_dir, strategy_name) tuples
        vendor: Vendor name (e.g., openrouter)
        model: Model name, used as a path component below the vendor
    """
    model_root = self.benchmark_dir / vendor / model
    per_strategy_stats = []

    for run_root, strategy in model_dirs_with_strategies:
        computed = self.compute_model_stats(run_root)
        per_strategy_stats.append(computed)

        # Persist this strategy's stats under its own sub-directory.
        strategy_out = model_root / strategy
        strategy_out.mkdir(exist_ok=True, parents=True)

        payload = asdict(computed)
        # Seeds vary between runs of the same setup, so drop it from the output.
        payload["config"].pop("seed")
        with open(strategy_out / "stats.json", "w") as fh:
            json.dump(payload, fh, indent=2)

        self.create_detailed_run_dirs(run_root, strategy_out)

    # Leaderboard comparing the strategies of this model.
    model_root.mkdir(parents=True, exist_ok=True)
    board = self.compute_models_leaderboard(per_strategy_stats)
    with open(model_root / "leaderboard.json", "w") as fh:
        json.dump(asdict(board), fh, indent=2)
174+
80175
def analyze_strategy_runs(self, strategy_dir: Path) -> None:
81176
models_stats = []
82177
output_dir = self.benchmark_dir / strategy_dir.relative_to(self.runs_dir)
@@ -111,6 +206,7 @@ def analyze_strategy_runs(self, strategy_dir: Path) -> None:
111206
def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
112207
stats: list[Stats] = []
113208
configs: list[Config] = []
209+
strategies: list[StrategyManifest] = []
114210
run_names: list[str] = []
115211
for run_dir in model_dir.iterdir():
116212
if not run_dir.is_dir():
@@ -119,6 +215,7 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
119215
# Skip runs that don't have required files
120216
stats_file = run_dir / "stats.json"
121217
config_file = run_dir / "config.json"
218+
strategy_file = run_dir / "strategy.json"
122219
if not stats_file.exists() or not config_file.exists():
123220
print(f"Skipping incomplete run: {run_dir.name}")
124221
continue
@@ -127,9 +224,29 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
127224
stats.append(Stats.from_dict(json.load(f)))
128225
with open(config_file, "r") as f:
129226
configs.append(Config(**json.load(f)))
227+
228+
# Load strategy manifest if available
229+
if strategy_file.exists():
230+
with open(strategy_file, "r") as f:
231+
strategy_data = json.load(f)
232+
strategies.append(StrategyManifest(**strategy_data))
233+
else:
234+
# Fallback: try to load from strategies directory
235+
try:
236+
strategies.append(
237+
StrategyManifest.from_manifest_file(configs[-1].strategy)
238+
)
239+
except FileNotFoundError:
240+
print(
241+
f"Warning: Could not find strategy manifest for {configs[-1].strategy}"
242+
)
243+
continue
244+
130245
run_names.append(run_dir.name)
131246

132247
config = configs[0]
248+
strategy = strategies[0] if strategies else None
249+
133250
for c in configs[1:]:
134251
# Compare all fields except seed (seeds can differ for multi-seed runs)
135252
assert (
@@ -140,6 +257,17 @@ def compute_model_stats(self, model_dir: Path) -> ModelStatsFull:
140257
and c.challenge == config.challenge
141258
), f"Configs differ (excluding seed): {c} vs {config}"
142259

260+
# Validate that all strategy manifests are identical
261+
if strategy is not None:
262+
for s in strategies[1:]:
263+
assert (
264+
s.name == strategy.name
265+
and s.description == strategy.description
266+
and s.author == strategy.author
267+
and s.version == strategy.version
268+
and s.tags == strategy.tags
269+
), f"Strategy manifests differ: {s} vs {strategy}"
270+
143271
avg_final_round = sum(stat.final_round for stat in stats) / len(stats)
144272
std_final_round = statistics.stdev(stat.final_round for stat in stats)
145273

@@ -216,6 +344,7 @@ def tot_std_dev(attr: str) -> float:
216344
std_dev=std_dev,
217345
stats=stats,
218346
config=config,
347+
strategy=strategy,
219348
)
220349

221350
def compute_models_leaderboard(
@@ -401,3 +530,46 @@ def create_detailed_run_dirs(self, model_dir: Path, output_dir: Path) -> None:
401530
if png_file.exists():
402531
screenshot_dest = custom_id_dir / "screenshot.png"
403532
screenshot_dest.write_bytes(png_file.read_bytes())
533+
534+
def generate_manifest(self, base_dir: Path, current_version: str) -> None:
    """Generate manifest.json tracking available benchmark versions.

    Scans ``base_dir`` for sub-directories named exactly ``v<major>.<minor>.<patch>``,
    sorts them numerically in descending order (newest first) and writes
    ``base_dir/manifest.json`` with the shape
    ``{"versions": [{"version": "v1.2.3", "latest": true}, ...]}``.

    Only the entry whose name equals ``f"v{current_version}"`` gets the
    ``"latest": true`` flag. If no such directory exists, no entry is flagged.

    Args:
        base_dir: Base directory containing version subdirectories
            (e.g., benchmarks/models or benchmarks/strategies). Created
            (including parents) if it does not exist yet.
        current_version: Current version string from __version__, without
            the leading "v".
    """
    # Imported here so the method does not rely on a module-level `import re`.
    import re

    # exist_ok makes a separate exists() check unnecessary (and race-free).
    base_dir.mkdir(parents=True, exist_ok=True)

    # fullmatch rejects names with trailing characters such as a newline.
    version_pattern = re.compile(r"v(\d+)\.(\d+)\.(\d+)")

    found: list[tuple[tuple[int, ...], str]] = []
    for item in base_dir.iterdir():
        match = version_pattern.fullmatch(item.name)
        if match and item.is_dir():
            found.append((tuple(int(part) for part in match.groups()), item.name))

    # Numeric comparison so that v0.10.0 sorts above v0.2.0; newest first.
    found.sort(reverse=True)

    latest_name = f"v{current_version}"
    manifest_entries = []
    for _, version_str in found:
        entry: dict[str, object] = {"version": version_str}
        if version_str == latest_name:
            entry["latest"] = True
        manifest_entries.append(entry)

    manifest_path = base_dir / "manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump({"versions": manifest_entries}, f, indent=2)

0 commit comments

Comments
 (0)