Skip to content

Commit f412a42

Browse files
committed
feat(models): add data models for benchmark files
1 parent 5d4d7c0 commit f412a42

1 file changed

Lines changed: 261 additions & 0 deletions

File tree

src/balatrobench/models.py

Lines changed: 261 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,261 @@
1+
"""Output data models for benchmark files.
2+
3+
================================================================================
4+
Output Directory Structure
5+
================================================================================
6+
7+
site/benchmarks/
8+
9+
├── models/ # Compare MODELS (same strategy)
10+
│ ├── manifest.json → Manifest
11+
│ └── {version}/{strategy}/
12+
│ ├── leaderboard.json → ModelsLeaderboard
13+
│ └── {vendor}/
14+
│ ├── {model}.json → Runs
15+
│ └── {model}/{run}/{request}/
16+
│ └── metadata.json → Request
17+
18+
└── strategies/ # Compare STRATEGIES (same model)
19+
├── manifest.json → Manifest
20+
└── {version}/{vendor}/{model}/
21+
├── leaderboard.json → StrategiesLeaderboard
22+
└── {strategy}/
23+
├── runs.json → Runs
24+
└── {run}/{request}/
25+
└── metadata.json → Request
26+
27+
================================================================================
28+
Dataclass Hierarchy
29+
================================================================================
30+
31+
Building Blocks (reusable components):
32+
Strategy - Strategy metadata (name, description, author, version, tags)
33+
Model - Model identification (vendor, name)
34+
Config - Run configuration (seed, deck, stake)
35+
Stats - Call/token/time/cost statistics (total, avg, std)
36+
37+
Output Files:
38+
Manifest - List of available versions
39+
40+
├── ModelsLeaderboard - Ranking models for a strategy
41+
│ └── ModelsLeaderboardEntry
42+
│ ├── LeaderboardEntry - run counts, round stats, Stats
43+
│ └── model: Model
44+
45+
├── StrategiesLeaderboard - Ranking strategies for a model
46+
│ └── StrategiesLeaderboardEntry
47+
│ ├── LeaderboardEntry - run counts, round stats, Stats
48+
│ └── strategy: Strategy
49+
50+
├── Runs - Collection of benchmark runs
51+
│ └── Run - Single run with Model, Strategy, Config, Stats
52+
53+
└── Request - Single LLM API call metadata
54+
55+
"""
56+
57+
from dataclasses import dataclass, field
from typing import Literal

from balatrobench.enums import Deck, Stake
61+
62+
################################################################################
63+
# Version, Manifest, Strategy, Model & Config
64+
################################################################################
65+
66+
67+
@dataclass(frozen=True)
class Version:
    """One version record as listed in manifest.json.

    Instances are immutable (frozen dataclass).
    """

    # Version identifier string.
    version: str
    # Optional flag, False unless given.
    # NOTE(review): presumably marks the most recent version - confirm.
    latest: bool = False
73+
74+
75+
@dataclass(frozen=True)
class Manifest:
    """Shape of a manifest.json file: the collection of known versions.

    Instances are immutable (frozen dataclass).
    """

    # Every version entry listed in the manifest.
    versions: tuple[Version, ...]
80+
81+
82+
@dataclass(frozen=True)
class Strategy:
    """Descriptive metadata attached to a strategy.

    Instances are immutable (frozen dataclass); all fields are required.
    """

    # Strategy name.
    name: str
    # Free-text description of the strategy.
    description: str
    # Author of the strategy.
    author: str
    # Version string of the strategy itself.
    version: str
    # Tags associated with the strategy (immutable sequence of strings).
    tags: tuple[str, ...]
91+
92+
93+
@dataclass(frozen=True)
class Model:
    """Identifies a model by its vendor and its name.

    Instances are immutable (frozen dataclass); both fields are required.
    """

    # Vendor the model belongs to.
    vendor: str
    # Name of the model.
    name: str
99+
100+
101+
@dataclass(frozen=True)
class Config:
    """Balatro game settings used to configure a run.

    Instances are immutable (frozen dataclass); all fields are required.
    """

    # Seed of the run, stored as a string.
    seed: str
    # Deck setting (a balatrobench.enums.Deck value).
    deck: Deck
    # Stake setting (a balatrobench.enums.Stake value).
    stake: Stake
108+
109+
110+
################################################################################
111+
# Stats
112+
################################################################################
113+
114+
115+
@dataclass(frozen=True)
class Stats:
    """Call, token, timing and cost statistics.

    The same structure is used both for one run and for an aggregate over
    several runs. Instances are immutable (frozen dataclass) and every
    field is required.
    """

    # --- Calls -------------------------------------------------------------
    # NOTE(review): the exact distinction between "error" and "failed" calls
    # is not visible in this module - confirm against the producer.
    calls_total: int
    calls_success: int
    calls_error: int
    calls_failed: int

    # --- Tokens (input / output): totals, averages, standard deviations ----
    tokens_in_total: int
    tokens_out_total: int
    tokens_in_avg: float
    tokens_out_avg: float
    tokens_in_std: float
    tokens_out_std: float

    # --- Timing (field names carry the unit: milliseconds) -----------------
    time_total_ms: int
    time_avg_ms: float
    time_std_ms: float

    # --- Cost: total, average, standard deviation --------------------------
    cost_total: float
    cost_avg: float
    cost_std: float
142+
143+
144+
################################################################################
145+
# Leaderboards
146+
################################################################################
147+
148+
149+
@dataclass(frozen=True)
class LeaderboardEntry:
    """Common statistics shared by every kind of leaderboard entry.

    Intended as a base class: the concrete entry types inherit these fields
    and add the field that identifies what is being ranked. Instances are
    immutable (frozen dataclass).
    """

    # --- Run summary: number of runs, of wins and of completed runs --------
    run_count: int
    run_wins: int
    run_completed: int

    # --- Round statistics --------------------------------------------------
    # NOTE(review): presumably mean / standard deviation of the round reached
    # across runs - confirm against the code that fills these in.
    avg_round: float
    std_round: float

    # --- Call/token/time/cost statistics for this entry --------------------
    stats: Stats
164+
165+
166+
@dataclass(frozen=True)
class ModelsLeaderboardEntry(LeaderboardEntry):
    """Row of the models leaderboard.

    Carries every LeaderboardEntry field plus the model the row refers to.
    """

    # Model this entry identifies.
    model: Model
171+
172+
173+
@dataclass(frozen=True)
class StrategiesLeaderboardEntry(LeaderboardEntry):
    """Row of the strategies leaderboard.

    Carries every LeaderboardEntry field plus the strategy the row refers to.
    """

    # Strategy this entry identifies.
    strategy: Strategy
178+
179+
180+
@dataclass(frozen=True)
class ModelsLeaderboard:
    """Leaderboard that compares models running the same strategy.

    Instances are immutable (frozen dataclass); all fields are required.
    """

    # Unix timestamp
    generated_at: int
    # The strategy shared by all entries.
    strategy: Strategy
    # One entry per model.
    entries: tuple[ModelsLeaderboardEntry, ...]
187+
188+
189+
@dataclass(frozen=True)
class StrategiesLeaderboard:
    """Leaderboard that compares strategies run with the same model.

    Instances are immutable (frozen dataclass); all fields are required.
    """

    # Unix timestamp
    generated_at: int
    # The model shared by all entries.
    model: Model
    # One entry per strategy.
    entries: tuple[StrategiesLeaderboardEntry, ...]
196+
197+
198+
################################################################################
199+
# Runs
200+
################################################################################
201+
202+
203+
@dataclass(frozen=True)
class Run:
    """Statistics for a single benchmark run.

    Instances are immutable (frozen dataclass) and all fields are required.

    Hashing: a frozen dataclass with the default ``eq=True`` gets a generated
    ``__hash__`` built from its fields. ``providers`` is a ``dict`` and
    therefore unhashable, so it is excluded from the hash (it still takes
    part in equality). Without that exclusion ``hash(run)`` - and the hash of
    any container dataclass holding ``Run`` objects - raises ``TypeError``.
    """

    # Run identification (directory name)
    id: str
    model: Model
    strategy: Strategy
    config: Config

    # Run outcome
    run_won: bool
    run_completed: bool

    # Final game state
    final_ante: int
    final_round: int

    # Provider usage distribution: provider_name -> call count.
    # hash=False keeps the instance hashable despite the dict value; the
    # field is still compared by __eq__ and remains a required argument.
    # Note that frozen=True does not stop the dict's contents being mutated.
    providers: dict[str, int] = field(hash=False)

    # Per-call statistics within this run
    stats: Stats
226+
227+
228+
@dataclass(frozen=True)
class Runs:
    """All runs recorded for one model + strategy pairing.

    Instances are immutable (frozen dataclass); all fields are required.
    """

    # Unix timestamp
    generated_at: int
    # Model the runs belong to.
    model: Model
    # Strategy the runs belong to.
    strategy: Strategy
    # The individual runs.
    runs: tuple[Run, ...]
236+
237+
238+
################################################################################
239+
# Request
240+
################################################################################
241+
242+
243+
@dataclass(frozen=True)
class Request:
    """Metadata describing one LLM API request.

    Instances are immutable (frozen dataclass); all fields are required.
    """

    # Request identifier (e.g., "00042")
    id: str
    # Outcome of the request; the annotation allows "success" or "error".
    status: Literal["success", "error"]
    # LLM provider (e.g., "openai", "azure", "groq")
    provider: str

    # --- Token usage (input / output) --------------------------------------
    tokens_in: int
    tokens_out: int

    # --- Timing (field name carries the unit: milliseconds) ----------------
    time_ms: int

    # --- Cost breakdown: input, output and total ---------------------------
    cost_in: float
    cost_out: float
    cost_total: float

0 commit comments

Comments
 (0)