Skip to content

Commit 4520a71

Browse files
S1M0N38claude
andcommitted
feat: update v0.7.0 benchmark results
Update current benchmark data with latest results: - Enhanced leaderboard with updated metrics - Improved openai model performance data (gpt-oss-120b, gpt-oss-20b) - Updated qwen model results (qwen3-235b-a22b-thinking-2507) 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
1 parent e943200 commit 4520a71

4 files changed

Lines changed: 439 additions & 55 deletions

File tree

data/benchmarks/v0.7.0/default/leaderboard.json

Lines changed: 220 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"generated_at": "2025-09-04T10:11:00.427201",
2+
"generated_at": "2025-09-04T10:52:59.779560",
33
"version": "0.7.0",
44
"strategy": "default",
55
"total_entries": 3,
@@ -34,26 +34,90 @@
3434
"avg_successful_calls": 29,
3535
"avg_invalid_responses": 0,
3636
"avg_failed_calls": 2,
37-
"avg_avg_input_tokens": 2749.81,
38-
"avg_avg_output_tokens": 2643.91,
39-
"avg_avg_reasoning_tokens": 2507.57,
40-
"avg_avg_total_tokens": 5393.72,
41-
"avg_avg_response_time_ms": 2319.17,
37+
"avg_avg_input_tokens": 2749.8082905982906,
38+
"avg_avg_output_tokens": 2643.9128205128204,
39+
"avg_avg_reasoning_tokens": 2507.5697435897437,
40+
"avg_avg_total_tokens": 5393.721111111111,
41+
"avg_avg_response_time_ms": 2319.1694525082758,
4242
"avg_total_input_tokens": 81573.4,
4343
"avg_total_output_tokens": 78361.6,
4444
"avg_total_reasoning_tokens": 74651.2,
4545
"avg_total_tokens": 159935,
4646
"avg_total_response_time_ms": 62944,
47-
"avg_total_cost": 0.26,
48-
"avg_avg_cost_per_call": 0.01,
47+
"avg_total_cost": 0.25955682,
48+
"avg_avg_cost_per_call": 0.008756680948717949,
4949
"avg_total_upstream_inference_cost": 0,
50-
"avg_total_upstream_prompt_cost": 0.02,
51-
"avg_total_upstream_completion_cost": 0.24,
50+
"avg_total_upstream_prompt_cost": 0.02447202,
51+
"avg_total_upstream_completion_cost": 0.23508479999999998,
5252
"avg_providers_used_count": 1,
5353
"avg_reasoning_calls": 0,
5454
"avg_avg_reasoning_content_length": 0.0,
5555
"avg_total_reasoning_content_length": 0,
5656
"avg_request_ids_count": 29
57+
},
58+
"total_stats": {
59+
"total_final_round": 17,
60+
"total_ante_reached": 8,
61+
"total_jokers_bought": 0,
62+
"total_jokers_sold": 2,
63+
"total_consumables_used": 3,
64+
"total_rerolls": 3,
65+
"total_money_spent": 64,
66+
"total_successful_calls": 145,
67+
"total_invalid_responses": 0,
68+
"total_failed_calls": 10,
69+
"total_avg_input_tokens": 13749.041452991452,
70+
"total_avg_output_tokens": 13219.564102564102,
71+
"total_avg_reasoning_tokens": 12537.848717948718,
72+
"total_avg_total_tokens": 26968.605555555554,
73+
"total_avg_response_time_ms": 11595.84726254138,
74+
"total_total_input_tokens": 407867,
75+
"total_total_output_tokens": 391808,
76+
"total_total_reasoning_tokens": 373256,
77+
"total_total_tokens": 799675,
78+
"total_total_response_time_ms": 314720,
79+
"total_total_cost": 1.2977840999999999,
80+
"total_avg_cost_per_call": 0.04378340474358974,
81+
"total_total_upstream_inference_cost": 0,
82+
"total_total_upstream_prompt_cost": 0.1223601,
83+
"total_total_upstream_completion_cost": 1.175424,
84+
"total_providers_used_count": 5,
85+
"total_reasoning_calls": 0,
86+
"total_avg_reasoning_content_length": 0.0,
87+
"total_total_reasoning_content_length": 0,
88+
"total_request_ids_count": 145
89+
},
90+
"standard_deviation_stats": {
91+
"std_final_round": 1.3416407864998738,
92+
"std_ante_reached": 0.5477225575051661,
93+
"std_jokers_bought": 0.0,
94+
"std_jokers_sold": 0.5477225575051661,
95+
"std_consumables_used": 0.5477225575051661,
96+
"std_rerolls": 0.8944271909999159,
97+
"std_money_spent": 8.78635305459552,
98+
"std_successful_calls": 15.411035007422441,
99+
"std_invalid_responses": 0.0,
100+
"std_failed_calls": 1.8708286933869707,
101+
"std_avg_input_tokens": 153.45269181754392,
102+
"std_avg_output_tokens": 284.26107070041485,
103+
"std_avg_reasoning_tokens": 335.4759133336672,
104+
"std_avg_total_tokens": 365.4445832265616,
105+
"std_avg_response_time_ms": 239.89051516121458,
106+
"std_total_input_tokens": 46276.57090040272,
107+
"std_total_output_tokens": 43995.828328376774,
108+
"std_total_reasoning_tokens": 42348.967209366514,
109+
"std_total_tokens": 90077.34098262448,
110+
"std_total_response_time_ms": 31281.6000549844,
111+
"std_total_cost": 0.14576189205171905,
112+
"std_avg_cost_per_call": 0.0008692734869068055,
113+
"std_total_upstream_inference_cost": 0.0,
114+
"std_total_upstream_prompt_cost": 0.013882971270120819,
115+
"std_total_upstream_completion_cost": 0.1319874849851303,
116+
"std_providers_used_count": 0.0,
117+
"std_reasoning_calls": 0.0,
118+
"std_avg_reasoning_content_length": 0.0,
119+
"std_total_reasoning_content_length": 0.0,
120+
"std_request_ids_count": 15.411035007422441
57121
}
58122
},
59123
{
@@ -85,26 +149,90 @@
85149
"avg_successful_calls": 24.75,
86150
"avg_invalid_responses": 0.5,
87151
"avg_failed_calls": 0.25,
88-
"avg_avg_input_tokens": 2210.44,
89-
"avg_avg_output_tokens": 610.14,
90-
"avg_avg_reasoning_tokens": 484.28,
91-
"avg_avg_total_tokens": 2820.58,
92-
"avg_avg_response_time_ms": 2566.22,
152+
"avg_avg_input_tokens": 2210.4400501672244,
153+
"avg_avg_output_tokens": 610.1413712374582,
154+
"avg_avg_reasoning_tokens": 484.2821571906354,
155+
"avg_avg_total_tokens": 2820.5814214046823,
156+
"avg_avg_response_time_ms": 2566.2245652173915,
93157
"avg_total_input_tokens": 54700.5,
94158
"avg_total_output_tokens": 15144,
95159
"avg_total_reasoning_tokens": 12037,
96160
"avg_total_tokens": 69844.5,
97161
"avg_total_response_time_ms": 64102,
98-
"avg_total_cost": 0.01,
99-
"avg_avg_cost_per_call": 0.0,
162+
"avg_total_cost": 0.011737845,
163+
"avg_avg_cost_per_call": 0.00047350322157190634,
100164
"avg_total_upstream_inference_cost": 0,
101-
"avg_total_upstream_prompt_cost": 0.0,
102-
"avg_total_upstream_completion_cost": 0.01,
165+
"avg_total_upstream_prompt_cost": 0.004923045,
166+
"avg_total_upstream_completion_cost": 0.0068148,
103167
"avg_providers_used_count": 1,
104168
"avg_reasoning_calls": 0,
105169
"avg_avg_reasoning_content_length": 0.0,
106170
"avg_total_reasoning_content_length": 0,
107171
"avg_request_ids_count": 24.75
172+
},
173+
"total_stats": {
174+
"total_final_round": 12,
175+
"total_ante_reached": 8,
176+
"total_jokers_bought": 0,
177+
"total_jokers_sold": 1,
178+
"total_consumables_used": 3,
179+
"total_rerolls": 0,
180+
"total_money_spent": 40,
181+
"total_successful_calls": 99,
182+
"total_invalid_responses": 2,
183+
"total_failed_calls": 1,
184+
"total_avg_input_tokens": 8841.760200668898,
185+
"total_avg_output_tokens": 2440.565484949833,
186+
"total_avg_reasoning_tokens": 1937.1286287625417,
187+
"total_avg_total_tokens": 11282.325685618729,
188+
"total_avg_response_time_ms": 10264.898260869566,
189+
"total_total_input_tokens": 218802,
190+
"total_total_output_tokens": 60576,
191+
"total_total_reasoning_tokens": 48148,
192+
"total_total_tokens": 279378,
193+
"total_total_response_time_ms": 256408,
194+
"total_total_cost": 0.04695138,
195+
"total_avg_cost_per_call": 0.0018940128862876254,
196+
"total_total_upstream_inference_cost": 0,
197+
"total_total_upstream_prompt_cost": 0.01969218,
198+
"total_total_upstream_completion_cost": 0.0272592,
199+
"total_providers_used_count": 4,
200+
"total_reasoning_calls": 0,
201+
"total_avg_reasoning_content_length": 0.0,
202+
"total_total_reasoning_content_length": 0,
203+
"total_request_ids_count": 99
204+
},
205+
"standard_deviation_stats": {
206+
"std_final_round": 0.0,
207+
"std_ante_reached": 0.0,
208+
"std_jokers_bought": 0.0,
209+
"std_jokers_sold": 0.5,
210+
"std_consumables_used": 0.5,
211+
"std_rerolls": 0.0,
212+
"std_money_spent": 2.0,
213+
"std_successful_calls": 1.2583057392117916,
214+
"std_invalid_responses": 0.5773502691896257,
215+
"std_failed_calls": 0.5,
216+
"std_avg_input_tokens": 20.975247503041835,
217+
"std_avg_output_tokens": 81.87974903242305,
218+
"std_avg_reasoning_tokens": 93.71423840196584,
219+
"std_avg_total_tokens": 72.01742244342569,
220+
"std_avg_response_time_ms": 162.13121496673895,
221+
"std_total_input_tokens": 2610.4281258061865,
222+
"std_total_output_tokens": 2573.217052640527,
223+
"std_total_reasoning_tokens": 2774.0779849648543,
224+
"std_total_tokens": 4729.463147828374,
225+
"std_total_response_time_ms": 4675.062994227992,
226+
"std_total_cost": 0.0013258154379475295,
227+
"std_avg_cost_per_call": 3.580355287107077e-05,
228+
"std_total_upstream_inference_cost": 0.0,
229+
"std_total_upstream_prompt_cost": 0.00023493853132255688,
230+
"std_total_upstream_completion_cost": 0.001157947673688237,
231+
"std_providers_used_count": 0.0,
232+
"std_reasoning_calls": 0.0,
233+
"std_avg_reasoning_content_length": 0.0,
234+
"std_total_reasoning_content_length": 0.0,
235+
"std_request_ids_count": 1.2583057392117916
108236
}
109237
},
110238
{
@@ -136,26 +264,90 @@
136264
"avg_successful_calls": 12,
137265
"avg_invalid_responses": 2,
138266
"avg_failed_calls": 0,
139-
"avg_avg_input_tokens": 2284.9,
140-
"avg_avg_output_tokens": 725.17,
141-
"avg_avg_reasoning_tokens": 605.28,
142-
"avg_avg_total_tokens": 3010.07,
143-
"avg_avg_response_time_ms": 2795.64,
267+
"avg_avg_input_tokens": 2284.8961449077237,
268+
"avg_avg_output_tokens": 725.1720984278879,
269+
"avg_avg_reasoning_tokens": 605.2846753246754,
270+
"avg_avg_total_tokens": 3010.068243335612,
271+
"avg_avg_response_time_ms": 2795.636274509804,
144272
"avg_total_input_tokens": 27975.2,
145273
"avg_total_output_tokens": 9010.4,
146274
"avg_total_reasoning_tokens": 7446,
147275
"avg_total_tokens": 36985.6,
148276
"avg_total_response_time_ms": 29405.6,
149-
"avg_total_cost": 0.0,
150-
"avg_avg_cost_per_call": 0.0,
277+
"avg_total_cost": 0.002571264,
278+
"avg_avg_cost_per_call": 0.00020838629063568012,
151279
"avg_total_upstream_inference_cost": 0,
152-
"avg_total_upstream_prompt_cost": 0.0,
153-
"avg_total_upstream_completion_cost": 0.0,
280+
"avg_total_upstream_prompt_cost": 0.001122664,
281+
"avg_total_upstream_completion_cost": 0.0014486,
154282
"avg_providers_used_count": 1.2,
155283
"avg_reasoning_calls": 0,
156284
"avg_avg_reasoning_content_length": 0.0,
157285
"avg_total_reasoning_content_length": 0,
158286
"avg_request_ids_count": 12
287+
},
288+
"total_stats": {
289+
"total_final_round": 10,
290+
"total_ante_reached": 6,
291+
"total_jokers_bought": 0,
292+
"total_jokers_sold": 0,
293+
"total_consumables_used": 2,
294+
"total_rerolls": 0,
295+
"total_money_spent": 6,
296+
"total_successful_calls": 60,
297+
"total_invalid_responses": 10,
298+
"total_failed_calls": 0,
299+
"total_avg_input_tokens": 11424.48072453862,
300+
"total_avg_output_tokens": 3625.8604921394394,
301+
"total_avg_reasoning_tokens": 3026.4233766233765,
302+
"total_avg_total_tokens": 15050.341216678058,
303+
"total_avg_response_time_ms": 13978.18137254902,
304+
"total_total_input_tokens": 139876,
305+
"total_total_output_tokens": 45052,
306+
"total_total_reasoning_tokens": 37230,
307+
"total_total_tokens": 184928,
308+
"total_total_response_time_ms": 147028,
309+
"total_total_cost": 0.012856320000000001,
310+
"total_avg_cost_per_call": 0.0010419314531784005,
311+
"total_total_upstream_inference_cost": 0,
312+
"total_total_upstream_prompt_cost": 0.00561332,
313+
"total_total_upstream_completion_cost": 0.0072429999999999994,
314+
"total_providers_used_count": 6,
315+
"total_reasoning_calls": 0,
316+
"total_avg_reasoning_content_length": 0.0,
317+
"total_total_reasoning_content_length": 0,
318+
"total_request_ids_count": 60
319+
},
320+
"standard_deviation_stats": {
321+
"std_final_round": 0.7071067811865476,
322+
"std_ante_reached": 0.4472135954999579,
323+
"std_jokers_bought": 0.0,
324+
"std_jokers_sold": 0.0,
325+
"std_consumables_used": 0.5477225575051661,
326+
"std_rerolls": 0.0,
327+
"std_money_spent": 1.6431676725154984,
328+
"std_successful_calls": 5.0990195135927845,
329+
"std_invalid_responses": 1.224744871391589,
330+
"std_failed_calls": 0.0,
331+
"std_avg_input_tokens": 166.78082561089806,
332+
"std_avg_output_tokens": 127.105143954031,
333+
"std_avg_reasoning_tokens": 100.52996231466867,
334+
"std_avg_total_tokens": 230.43803840595854,
335+
"std_avg_response_time_ms": 269.5746424833945,
336+
"std_total_input_tokens": 13225.179684979708,
337+
"std_total_output_tokens": 4594.992252441782,
338+
"std_total_reasoning_tokens": 3710.673186902883,
339+
"std_total_tokens": 17652.933164208152,
340+
"std_total_response_time_ms": 13349.346512095639,
341+
"std_total_cost": 0.0012465670525406967,
342+
"std_avg_cost_per_call": 2.2920700958973334e-05,
343+
"std_total_upstream_inference_cost": 0.0,
344+
"std_total_upstream_prompt_cost": 0.0005285137149403031,
345+
"std_total_upstream_completion_cost": 0.0007330546402554177,
346+
"std_providers_used_count": 0.4472135954999579,
347+
"std_reasoning_calls": 0.0,
348+
"std_avg_reasoning_content_length": 0.0,
349+
"std_total_reasoning_content_length": 0.0,
350+
"std_request_ids_count": 5.0990195135927845
159351
}
160352
}
161353
]

0 commit comments

Comments
 (0)