Skip to content

Commit bdc9551

Browse files
committed
feat(balatrobench): update strategies to match model format
1 parent a022722 commit bdc9551

4 files changed

Lines changed: 54 additions & 5 deletions

File tree

src/balatrobench/analyzer.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -125,11 +125,14 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
125125

126126
# Load strategy (once)
127127
if strategy_obj is None:
128+
strategy_key = source_task["strategy"]
129+
128130
if strategy_file.exists():
129131
with strategy_file.open() as f:
130132
source_strategy: SourceStrategy = json.load(f)
131133
strategy_obj = Strategy(
132134
name=source_strategy["name"],
135+
key=strategy_key,
133136
description=source_strategy["description"],
134137
author=source_strategy["author"],
135138
version=source_strategy["version"],
@@ -138,6 +141,7 @@ def _compute_runs(self, model_dir: Path) -> Runs | None:
138141
else:
139142
strategy_obj = Strategy(
140143
name=source_task["strategy"],
144+
key=strategy_key,
141145
description="",
142146
author="",
143147
version="",

src/balatrobench/cli.py

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -100,14 +100,15 @@ def main() -> None:
100100

101101
# Get strategy from first Runs
102102
strategy = runs_list[0].strategy
103+
strategy_key = strategy.key
103104

104105
# Write leaderboard
105106
leaderboard = analyzer.create_models_leaderboard(strategy, runs_list)
106-
models_writer.write_models_leaderboard(leaderboard, version, strategy_name)
107+
models_writer.write_models_leaderboard(leaderboard, version, strategy_key)
107108

108109
# Write model runs and request files
109110
for runs in runs_list:
110-
models_writer.write_runs(runs, version, strategy_name)
111+
models_writer.write_runs(runs, version, strategy_key)
111112

112113
# Write per-request files for each run
113114
for run in runs.runs:
@@ -122,7 +123,7 @@ def main() -> None:
122123
output_base = (
123124
models_output_dir
124125
/ version
125-
/ strategy_name
126+
/ strategy_key
126127
/ runs.model.vendor
127128
/ runs.model.name
128129
)
@@ -150,10 +151,20 @@ def main() -> None:
150151
leaderboard, version, model_key
151152
)
152153

153-
# Write strategy runs
154+
# Write strategy runs and request files
154155
for runs in runs_list:
155156
strategies_writer.write_strategy_runs(runs, version, vendor, model_name)
156157

158+
# Write per-request files for each run
159+
strategy_key = runs.strategy.key
160+
for run in runs.runs:
161+
# Find run directory in input
162+
run_dir = input_dir / strategy_key / vendor / model_name / run.id
163+
if run_dir.exists():
164+
strategies_writer.write_strategy_request_files(
165+
run_dir, version, vendor, model_name, strategy_key, run.id
166+
)
167+
157168
# Convert PNGs to WebP if enabled
158169
if args.webp:
159170
print("\nConverting PNG screenshots to WebP format...")

src/balatrobench/models.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,7 @@ class Strategy:
8484
"""Strategy metadata."""
8585

8686
name: str
87+
key: str
8788
description: str
8889
author: str
8990
version: str

src/balatrobench/writer.py

Lines changed: 34 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -108,18 +108,38 @@ def write_strategy_runs(
108108
) -> Path:
109109
"""Write runs.json for a strategy (when analyzing strategies per model).
110110
111+
Output: {version}/{vendor}/{model}/{strategy_key}/runs.json
112+
111113
Returns the path to the written file.
112114
"""
113115
output_path = (
114116
self.output_dir
115117
/ version
116118
/ vendor
117119
/ model_name
118-
/ runs.strategy.name
120+
/ runs.strategy.key
119121
/ "runs.json"
120122
)
121123
return self._write_json(output_path, runs)
122124

125+
def write_strategy_request_files(
126+
self,
127+
run_dir: Path,
128+
version: str,
129+
vendor: str,
130+
model_name: str,
131+
strategy_key: str,
132+
run_id: str,
133+
) -> None:
134+
"""Extract and write per-request files for a strategy run.
135+
136+
Output: {version}/{vendor}/{model}/{strategy_key}/{run_id}/{request_id}/
137+
Each containing: reasoning.md, tool_call.json, strategy.md, gamestate.md,
138+
memory.md, metadata.json, and screenshot.png (if available).
139+
"""
140+
output_base = self.output_dir / version / vendor / model_name / strategy_key
141+
self._write_request_files_impl(run_dir, output_base)
142+
123143
def write_request_files(
124144
self,
125145
run_dir: Path,
@@ -131,6 +151,19 @@ def write_request_files(
131151
Each containing: reasoning.md, tool_call.json, strategy.md, gamestate.md,
132152
memory.md, metadata.json, and screenshot.webp (if available).
133153
"""
154+
self._write_request_files_impl(run_dir, output_base)
155+
156+
def _write_request_files_impl(
157+
self,
158+
run_dir: Path,
159+
output_base: Path,
160+
) -> None:
161+
"""Internal implementation for writing per-request files.
162+
163+
Creates directories like: {output_base}/{run_id}/{request_id}/
164+
Each containing: reasoning.md, tool_call.json, strategy.md, gamestate.md,
165+
memory.md, metadata.json, and screenshot.png (if available).
166+
"""
134167
run_id = run_dir.name
135168
requests_file = run_dir / "requests.jsonl"
136169
responses_file = run_dir / "responses.jsonl"

0 commit comments

Comments
 (0)