Skip to content

Commit 34a815e

Browse files
committed
feat: add benchmark for v0.7.0
1 parent 14dc13a commit 34a815e

5 files changed

Lines changed: 1222 additions & 1 deletion

File tree

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
{
2+
"generated_at": "2025-09-04T10:11:00.427201",
3+
"version": "0.7.0",
4+
"strategy": "default",
5+
"total_entries": 3,
6+
"total_runs_analyzed": 14,
7+
"entries": [
8+
{
9+
"rank": 1,
10+
"config": {
11+
"model": "qwen/qwen3-235b-a22b-thinking-2507",
12+
"strategy": "default",
13+
"deck": "Red Deck",
14+
"stake": 1,
15+
"seed": "OOOO155",
16+
"challenge": null,
17+
"version": "0.7.0",
18+
"name": "Unknown Name",
19+
"description": "Unknown Description",
20+
"author": "BalatroBench",
21+
"tags": []
22+
},
23+
"total_runs": 5,
24+
"completed_runs": 5,
25+
"won_runs": 0,
26+
"averaged_stats": {
27+
"avg_final_round": 3.4,
28+
"avg_ante_reached": 1.6,
29+
"avg_jokers_bought": 0,
30+
"avg_jokers_sold": 0.4,
31+
"avg_consumables_used": 0.6,
32+
"avg_rerolls": 0.6,
33+
"avg_money_spent": 12.8,
34+
"avg_successful_calls": 29,
35+
"avg_invalid_responses": 0,
36+
"avg_failed_calls": 2,
37+
"avg_avg_input_tokens": 2749.81,
38+
"avg_avg_output_tokens": 2643.91,
39+
"avg_avg_reasoning_tokens": 2507.57,
40+
"avg_avg_total_tokens": 5393.72,
41+
"avg_avg_response_time_ms": 2319.17,
42+
"avg_total_input_tokens": 81573.4,
43+
"avg_total_output_tokens": 78361.6,
44+
"avg_total_reasoning_tokens": 74651.2,
45+
"avg_total_tokens": 159935,
46+
"avg_total_response_time_ms": 62944,
47+
"avg_total_cost": 0.26,
48+
"avg_avg_cost_per_call": 0.01,
49+
"avg_total_upstream_inference_cost": 0,
50+
"avg_total_upstream_prompt_cost": 0.02,
51+
"avg_total_upstream_completion_cost": 0.24,
52+
"avg_providers_used_count": 1,
53+
"avg_reasoning_calls": 0,
54+
"avg_avg_reasoning_content_length": 0.0,
55+
"avg_total_reasoning_content_length": 0,
56+
"avg_request_ids_count": 29
57+
}
58+
},
59+
{
60+
"rank": 2,
61+
"config": {
62+
"model": "openai/gpt-oss-120b",
63+
"strategy": "default",
64+
"deck": "Red Deck",
65+
"stake": 1,
66+
"seed": "OOOO155",
67+
"challenge": null,
68+
"version": "0.7.0",
69+
"name": "Unknown Name",
70+
"description": "Unknown Description",
71+
"author": "BalatroBench",
72+
"tags": []
73+
},
74+
"total_runs": 4,
75+
"completed_runs": 4,
76+
"won_runs": 0,
77+
"averaged_stats": {
78+
"avg_final_round": 3,
79+
"avg_ante_reached": 2,
80+
"avg_jokers_bought": 0,
81+
"avg_jokers_sold": 0.25,
82+
"avg_consumables_used": 0.75,
83+
"avg_rerolls": 0,
84+
"avg_money_spent": 10,
85+
"avg_successful_calls": 24.75,
86+
"avg_invalid_responses": 0.5,
87+
"avg_failed_calls": 0.25,
88+
"avg_avg_input_tokens": 2210.44,
89+
"avg_avg_output_tokens": 610.14,
90+
"avg_avg_reasoning_tokens": 484.28,
91+
"avg_avg_total_tokens": 2820.58,
92+
"avg_avg_response_time_ms": 2566.22,
93+
"avg_total_input_tokens": 54700.5,
94+
"avg_total_output_tokens": 15144,
95+
"avg_total_reasoning_tokens": 12037,
96+
"avg_total_tokens": 69844.5,
97+
"avg_total_response_time_ms": 64102,
98+
"avg_total_cost": 0.01,
99+
"avg_avg_cost_per_call": 0.0,
100+
"avg_total_upstream_inference_cost": 0,
101+
"avg_total_upstream_prompt_cost": 0.0,
102+
"avg_total_upstream_completion_cost": 0.01,
103+
"avg_providers_used_count": 1,
104+
"avg_reasoning_calls": 0,
105+
"avg_avg_reasoning_content_length": 0.0,
106+
"avg_total_reasoning_content_length": 0,
107+
"avg_request_ids_count": 24.75
108+
}
109+
},
110+
{
111+
"rank": 3,
112+
"config": {
113+
"model": "openai/gpt-oss-20b",
114+
"strategy": "default",
115+
"deck": "Red Deck",
116+
"stake": 1,
117+
"seed": "OOOO155",
118+
"challenge": null,
119+
"version": "0.7.0",
120+
"name": "Unknown Name",
121+
"description": "Unknown Description",
122+
"author": "BalatroBench",
123+
"tags": []
124+
},
125+
"total_runs": 5,
126+
"completed_runs": 4,
127+
"won_runs": 0,
128+
"averaged_stats": {
129+
"avg_final_round": 2,
130+
"avg_ante_reached": 1.2,
131+
"avg_jokers_bought": 0,
132+
"avg_jokers_sold": 0,
133+
"avg_consumables_used": 0.4,
134+
"avg_rerolls": 0,
135+
"avg_money_spent": 1.2,
136+
"avg_successful_calls": 12,
137+
"avg_invalid_responses": 2,
138+
"avg_failed_calls": 0,
139+
"avg_avg_input_tokens": 2284.9,
140+
"avg_avg_output_tokens": 725.17,
141+
"avg_avg_reasoning_tokens": 605.28,
142+
"avg_avg_total_tokens": 3010.07,
143+
"avg_avg_response_time_ms": 2795.64,
144+
"avg_total_input_tokens": 27975.2,
145+
"avg_total_output_tokens": 9010.4,
146+
"avg_total_reasoning_tokens": 7446,
147+
"avg_total_tokens": 36985.6,
148+
"avg_total_response_time_ms": 29405.6,
149+
"avg_total_cost": 0.0,
150+
"avg_avg_cost_per_call": 0.0,
151+
"avg_total_upstream_inference_cost": 0,
152+
"avg_total_upstream_prompt_cost": 0.0,
153+
"avg_total_upstream_completion_cost": 0.0,
154+
"avg_providers_used_count": 1.2,
155+
"avg_reasoning_calls": 0,
156+
"avg_avg_reasoning_content_length": 0.0,
157+
"avg_total_reasoning_content_length": 0,
158+
"avg_request_ids_count": 12
159+
}
160+
}
161+
]
162+
}

0 commit comments

Comments
 (0)