Skip to content

Commit ccc3a97

Browse files
committed
feat(writer): add file I/O and WebP conversion for benchmark output
1 parent abd56fa commit ccc3a97

1 file changed

Lines changed: 260 additions & 0 deletions

File tree

src/balatrobench/writer.py

Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
"""File I/O for BalatroBench output."""
2+
3+
import json
4+
import subprocess
5+
from concurrent.futures import ThreadPoolExecutor
6+
from dataclasses import asdict
7+
from pathlib import Path
8+
from typing import Any
9+
10+
from tqdm import tqdm
11+
12+
from .extractor import (
13+
extract_request_content,
14+
extract_request_metadata,
15+
extract_response_data,
16+
)
17+
from .models import (
18+
Manifest,
19+
ModelsLeaderboard,
20+
Runs,
21+
StrategiesLeaderboard,
22+
Version,
23+
)
24+
25+
26+
class BenchmarkWriter:
    """Writes benchmark data to files.

    Leaderboards, run summaries and the manifest are written below
    ``output_dir``; per-request artefacts are written below the
    ``output_base`` passed to :meth:`write_request_files`. Parent
    directories are created on demand and all text/JSON output is UTF-8.
    """

    def __init__(self, output_dir: Path) -> None:
        """Store the base directory used by the manifest/leaderboard/runs writers."""
        self.output_dir = output_dir

    def _write_json(self, path: Path, payload: object) -> Path:
        """Serialise ``payload`` as indented JSON at ``path``.

        Creates missing parent directories, converts dataclasses via
        :meth:`_to_dict`, and returns ``path`` so callers can hand it back.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding: never depend on the platform's locale default.
        with path.open("w", encoding="utf-8") as f:
            json.dump(self._to_dict(payload), f, indent=2)
        return path

    def write_manifest(self, versions: list[str], latest_version: str) -> Path:
        """Write manifest.json to base directory.

        Args:
            versions: List of version strings (e.g., ["v1.0.0", "v0.16.0"])
            latest_version: The version to mark as latest

        Returns the path to the written file.
        """
        version_entries = tuple(
            Version(version=v, latest=(v == latest_version)) for v in versions
        )
        manifest = Manifest(versions=version_entries)
        return self._write_json(self.output_dir / "manifest.json", manifest)

    def write_models_leaderboard(
        self, leaderboard: ModelsLeaderboard, version: str, strategy: str
    ) -> Path:
        """Write models leaderboard.json for a strategy.

        Output: {version}/{strategy}/leaderboard.json

        Returns the path to the written file.
        """
        output_path = self.output_dir / version / strategy / "leaderboard.json"
        return self._write_json(output_path, leaderboard)

    def write_strategies_leaderboard(
        self, leaderboard: StrategiesLeaderboard, version: str, model_key: str
    ) -> Path:
        """Write strategies leaderboard.json for a model.

        Output: {version}/{model_key}/leaderboard.json
        (the original documentation describes ``model_key`` as
        ``{vendor}/{model}``).

        Returns the path to the written file.
        """
        output_path = self.output_dir / version / model_key / "leaderboard.json"
        return self._write_json(output_path, leaderboard)

    def write_runs(self, runs: Runs, version: str, strategy: str) -> Path:
        """Write {model}.json for a model.

        Output: {version}/{strategy}/{runs.model.vendor}/{runs.model.name}.json

        Returns the path to the written file.
        """
        output_path = (
            self.output_dir
            / version
            / strategy
            / runs.model.vendor
            / f"{runs.model.name}.json"
        )
        return self._write_json(output_path, runs)

    def write_strategy_runs(
        self, runs: Runs, version: str, vendor: str, model_name: str
    ) -> Path:
        """Write runs.json for a strategy (when analyzing strategies per model).

        Output: {version}/{vendor}/{model_name}/{runs.strategy.name}/runs.json

        Returns the path to the written file.
        """
        output_path = (
            self.output_dir
            / version
            / vendor
            / model_name
            / runs.strategy.name
            / "runs.json"
        )
        return self._write_json(output_path, runs)

    def write_request_files(
        self,
        run_dir: Path,
        output_base: Path,
    ) -> None:
        """Extract and write per-request files from a run directory.

        Creates directories like: {output_base}/{run_id}/{request_id}/
        Each containing, when the corresponding data exists: strategy.md,
        gamestate.md, memory.md, reasoning.md, tool_call.json, metadata.json
        and screenshot.png. Screenshots are copied as PNG here; use
        :meth:`convert_pngs_to_webp` afterwards to turn them into WebP.

        Args:
            run_dir: Directory holding requests.jsonl, responses.jsonl and an
                optional screenshots/ folder. Its name is used as the run id.
            output_base: Directory under which {run_id}/{request_id}/ is created.
        """
        run_id = run_dir.name
        requests_file = run_dir / "requests.jsonl"
        responses_file = run_dir / "responses.jsonl"
        screenshots_dir = run_dir / "screenshots"

        # Extract data
        request_content = extract_request_content(requests_file)
        response_data = extract_response_data(responses_file)
        # NOTE(review): metadata is extracted from responses.jsonl, not
        # requests.jsonl - kept as in the original; confirm this is intended.
        requests_by_id = extract_request_metadata(responses_file)

        all_custom_ids = set(request_content) | set(response_data)

        # Sorted so that files are produced in a reproducible order.
        for custom_id in sorted(all_custom_ids):
            # Convert "request-00042" to "00042" (prefix only, not every match)
            request_id = custom_id.removeprefix("request-")
            request_dir = output_base / run_id / request_id
            request_dir.mkdir(parents=True, exist_ok=True)

            # Write request content
            if custom_id in request_content:
                content = request_content[custom_id]
                for section in ("strategy", "gamestate", "memory"):
                    (request_dir / f"{section}.md").write_text(
                        content[section], encoding="utf-8"
                    )

            # Write response data
            if custom_id in response_data:
                data = response_data[custom_id]
                (request_dir / "reasoning.md").write_text(
                    data["reasoning"], encoding="utf-8"
                )

                # Strip reasoning from tool_call arguments before writing;
                # it is already stored separately in reasoning.md.
                cleaned_tool_calls = self._strip_reasoning_from_tool_calls(
                    data["tool_call"]
                )
                self._write_json(request_dir / "tool_call.json", cleaned_tool_calls)

            # Write metadata
            if custom_id in requests_by_id:
                self._write_json(
                    request_dir / "metadata.json", requests_by_id[custom_id]
                )

            # Copy screenshot if exists
            png_file = screenshots_dir / f"{custom_id}.png"
            if png_file.exists():
                (request_dir / "screenshot.png").write_bytes(png_file.read_bytes())

    @staticmethod
    def _strip_reasoning_from_tool_calls(tool_calls: list[dict]) -> list[dict]:
        """Remove reasoning field from tool call arguments.

        Returns new tool-call dicts; the input list and its dicts are not
        mutated. ``function.arguments`` may be a JSON-encoded string (it is
        decoded, stripped and re-encoded) or an already-decoded dict (a
        stripped copy is stored). Arguments that are not valid JSON, or that
        decode to something other than an object, are left unchanged, as are
        tool calls whose ``function`` entry is missing or not a dict.
        """
        result = []
        for tc in tool_calls:
            tc_copy = tc.copy()
            func = tc_copy.get("function")
            if isinstance(func, dict):
                func = func.copy()
                raw_args = func.get("arguments")
                if isinstance(raw_args, dict):
                    func["arguments"] = {
                        k: v for k, v in raw_args.items() if k != "reasoning"
                    }
                elif isinstance(raw_args, (str, bytes, bytearray)):
                    try:
                        args = json.loads(raw_args)
                    except ValueError:
                        # Not JSON (JSONDecodeError / bad encoding): keep as-is.
                        args = None
                    # Only JSON objects can carry a "reasoning" key.
                    if isinstance(args, dict):
                        args.pop("reasoning", None)
                        func["arguments"] = json.dumps(args)
                tc_copy["function"] = func
            result.append(tc_copy)
        return result

    def convert_pngs_to_webp(self, directory: Path) -> None:
        """Convert all screenshot.png files in directory to WebP.

        Searches ``directory`` recursively and converts the files in a thread
        pool. Requires cwebp to be installed; if it is missing, or any other
        error occurs, a warning is printed and the PNG files are kept.
        """
        try:
            png_files = list(directory.rglob("screenshot.png"))
            if not png_files:
                return

            with ThreadPoolExecutor() as executor:
                # list() drains the lazy map so every conversion runs and
                # the first worker exception (if any) is re-raised here.
                list(
                    tqdm(
                        executor.map(self._convert_single_png_to_webp, png_files),
                        total=len(png_files),
                        desc="Converting to WebP",
                    )
                )
        except FileNotFoundError:
            print("Warning: cwebp not found, keeping PNG format")
        except Exception as e:
            # Best-effort step: conversion problems must not abort the run.
            print(f"Warning: cwebp conversion error: {e}")

    def _convert_single_png_to_webp(self, png_file: Path) -> None:
        """Convert a single PNG file to WebP.

        Writes a sibling file with the ``.webp`` suffix and deletes the PNG
        once cwebp has succeeded. A failing cwebp run or a failing delete is
        reported as a warning.

        Raises:
            FileNotFoundError: If the cwebp executable cannot be started.
                This is deliberately not swallowed so that
                :meth:`convert_pngs_to_webp` can report it once instead of
                mislabelling it as a delete failure for every file.
        """
        webp_file = png_file.with_suffix(".webp")
        try:
            subprocess.run(
                ["cwebp", "-q", "80", "-quiet", str(png_file), "-o", str(webp_file)],
                capture_output=True,
                text=True,
                check=True,
            )
        except subprocess.CalledProcessError as e:
            print(f"Warning: cwebp conversion failed for {png_file}: {e.stderr}")
            return

        # Only the delete is guarded here, so the message below is accurate.
        try:
            png_file.unlink()
        except OSError as e:
            print(f"Warning: Could not remove {png_file}: {e}")

    @staticmethod
    def _to_dict(obj: object) -> Any:
        """Convert dataclass to dict, handling nested dataclasses and tuples.

        Dataclass instances go through ``dataclasses.asdict``; dicts are
        rebuilt with converted values; lists and tuples both become lists.
        Any other value is returned unchanged.
        """
        if hasattr(obj, "__dataclass_fields__"):
            return {
                k: BenchmarkWriter._to_dict(v)
                for k, v in asdict(obj).items()  # type: ignore[arg-type]
            }
        elif isinstance(obj, dict):
            return {k: BenchmarkWriter._to_dict(v) for k, v in obj.items()}
        elif isinstance(obj, (list, tuple)):
            return [BenchmarkWriter._to_dict(item) for item in obj]
        else:
            return obj

0 commit comments

Comments
 (0)