@@ -132,6 +132,18 @@ <h1 class="text-2xl sm:text-4xl font-bold text-white mb-4">
132132 </ svg > (s)
133133 </ div >
134134 </ th >
<!-- Column header "Cost per 1K ($)"; `hidden xl:table-cell` hides it below the xl breakpoint. -->
<th
  class="px-2 py-3 text-center text-xs sm:text-sm font-medium text-gray-300 w-32 sm:w-36 hidden xl:table-cell">
  <div class="flex items-center justify-center gap-1">
    Cost per 1K
    <!--
      Decorative icon, so it is hidden from assistive technology.
      Attributes use the HTML/SVG spellings (class, fill-rule, clip-rule): the JSX
      spellings (className, fillRule, clipRule) are unknown attributes in plain HTML,
      so the evenodd rules were never applied.
    -->
    <svg class="w-4 h-4" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16" fill="currentColor"
      aria-hidden="true">
      <path fill-rule="evenodd" clip-rule="evenodd"
        d="M15 4.5A3.5 3.5 0 0 1 11.435 8c-.99-.019-2.093.132-2.7.913l-4.13 5.31a2.015 2.015 0 1 1-2.827-2.828l5.309-4.13c.78-.607.932-1.71.914-2.7L8 4.5a3.5 3.5 0 0 1 4.477-3.362c.325.094.39.497.15.736L10.6 3.902a.48.48 0 0 0-.033.653c.271.314.565.608.879.879a.48.48 0 0 0 .653-.033l2.027-2.027c.239-.24.642-.175.736.15.09.31.138.637.138.976ZM3.75 13a.75.75 0 1 1-1.5 0 .75.75 0 0 1 1.5 0Z" />
      <path
        d="M11.5 9.5c.313 0 .62-.029.917-.084l1.962 1.962a2.121 2.121 0 0 1-3 3l-2.81-2.81 1.35-1.734c.05-.064.158-.158.426-.233.278-.078.639-.11 1.062-.102l.093.001ZM5 4l1.446 1.445a2.256 2.256 0 0 1-.047.21c-.075.268-.169.377-.233.427l-.61.474L4 5H2.655a.25.25 0 0 1-.224-.139l-1.35-2.7a.25.25 0 0 1 .047-.289l.745-.745a.25.25 0 0 1 .289-.047l2.7 1.35A.25.25 0 0 1 5 2.654V4Z" />
    </svg>
    ($)
  </div>
</th>
135147 </ tr >
136148 </ thead >
137149 < tbody id ="leaderboard-body " class ="divide-y divide-gray-700 ">
@@ -142,27 +154,51 @@ <h1 class="text-2xl sm:text-4xl font-bold text-white mb-4">
142154 </ div >
143155
144156
145- <!-- Methodology -->
157+ <!-- Leaderboard Columns Explained -->
146158 < div class ="bg-gray-800 rounded-lg p-4 sm:p-6 lg:p-8 border border-gray-700 ">
147- < h2 class ="text-xl sm:text-2xl font-bold mb-4 "> Methodology</ h2 >
148- < div class ="grid sm:grid-cols-1 md:grid-cols-2 gap-4 sm:gap-6 text-gray-300 ">
149- < div >
150- < h3 class ="font-semibold text-white mb-2 text-base sm:text-lg "> Game Parameters</ h3 >
151- < ul class ="space-y-1 text-sm sm:text-base ">
152- < li > • Balatro v1.0.1n</ li >
153- < li > • 100 consistent seeds</ li >
154- < li > • Standard deck configuration</ li >
155- < li > • No modifications or cheats</ li >
156- </ ul >
159+ < h2 class ="text-xl sm:text-2xl font-bold mb-4 "> Leaderboard Columns Explained</ h2 >
160+ < div class ="grid sm:grid-cols-1 lg:grid-cols-2 gap-4 sm:gap-6 text-gray-300 ">
161+ < div class ="space-y-4 ">
<!-- Explains the Rank, Model and Provider columns as term/description pairs. -->
<div>
  <h3 class="font-semibold text-white mb-2 text-base sm:text-lg">Ranking &amp; Model Info</h3>
  <!--
    A description list replaces the stack of anonymous divs with styled <strong> labels.
    dt/dd are displayed inline so each pair still reads as a single row; ml-0 guards
    against the browser's default dd indent.
  -->
  <dl class="space-y-2 text-sm sm:text-base">
    <div>
      <dt class="inline font-bold text-blue-400">Rank:</dt>
      <dd class="inline ml-0">Sorted by average rounds reached (highest first)</dd>
    </div>
    <div>
      <dt class="inline font-bold text-blue-400">Model:</dt>
      <dd class="inline ml-0">OpenRouter model names with creator-suggested parameters for open-source models, defaults for closed-source</dd>
    </div>
    <div>
      <dt class="inline font-bold text-blue-400">Provider:</dt>
      <dd class="inline ml-0">Model developer/organization</dd>
    </div>
  </dl>
</div>
<!-- Explains the Rounds and Completed columns as term/description pairs. -->
<div>
  <h3 class="font-semibold text-white mb-2 text-base sm:text-lg">Performance Metrics</h3>
  <!--
    A description list replaces the stack of anonymous divs with styled <strong> labels.
    dt/dd are displayed inline so each pair still reads as a single row; ml-0 guards
    against the browser's default dd indent.
  -->
  <dl class="space-y-2 text-sm sm:text-base">
    <div>
      <dt class="inline font-bold text-blue-400">Rounds:</dt>
      <dd class="inline ml-0">Average rounds reached across multiple games</dd>
    </div>
    <div>
      <dt class="inline font-bold text-blue-400">Completed:</dt>
      <dd class="inline ml-0">Success rate for round completion (rounds stopped at 3 consecutive errors/failed calls)</dd>
    </div>
  </dl>
</div>
157179 </ div >
158- < div >
159- < h3 class ="font-semibold text-white mb-2 text-base sm:text-lg "> Evaluation Criteria</ h3 >
160- < ul class ="space-y-1 text-sm sm:text-base ">
161- < li > • Average ante reached</ li >
162- < li > • Win rate across seeds</ li >
163- < li > • Token efficiency</ li >
164- < li > • Decision quality scoring</ li >
165- </ ul >
180+ < div class ="space-y-4 ">
<!-- Explains the Success / Error / Failed call categories as term/description pairs. -->
<div>
  <h3 class="font-semibold text-white mb-2 text-base sm:text-lg">Call Quality Breakdown</h3>
  <!--
    A description list replaces the stack of anonymous divs with styled <strong> labels.
    Each term keeps its own colour (green / red / yellow) and is also named in text, so the
    category is not conveyed by colour alone. dt/dd are displayed inline so each pair
    still reads as a single row; ml-0 guards against the browser's default dd indent.
  -->
  <dl class="space-y-2 text-sm sm:text-base">
    <div>
      <dt class="inline font-bold text-green-400">Success:</dt>
      <dd class="inline ml-0">Valid tool calls that execute successfully in game state</dd>
    </div>
    <div>
      <dt class="inline font-bold text-red-400">Error:</dt>
      <dd class="inline ml-0">Invalid responses (no tool call, JSON errors, plain text responses)</dd>
    </div>
    <div>
      <dt class="inline font-bold text-yellow-400">Failed:</dt>
      <dd class="inline ml-0">Valid tool calls that can't execute (e.g., discarding 6 cards when limit is 5)</dd>
    </div>
  </dl>
</div>
<!-- Explains the token, time and cost columns as term/description pairs. -->
<div>
  <h3 class="font-semibold text-white mb-2 text-base sm:text-lg">Efficiency Metrics</h3>
  <!--
    A description list replaces the stack of anonymous divs with styled <strong> labels.
    dt/dd are displayed inline so each pair still reads as a single row; ml-0 guards
    against the browser's default dd indent.
  -->
  <dl class="space-y-2 text-sm sm:text-base">
    <div>
      <dt class="inline font-bold text-blue-400">Input/Output Tokens:</dt>
      <dd class="inline ml-0">Token counts per tool call (including reasoning and tool call tokens)</dd>
    </div>
    <div>
      <dt class="inline font-bold text-blue-400">Time per Call:</dt>
      <dd class="inline ml-0">Average LLM response generation time</dd>
    </div>
    <div>
      <dt class="inline font-bold text-blue-400">Cost per 1K Calls:</dt>
      <dd class="inline ml-0">Pricing based on cheapest OpenRouter option (scaled to 1K for visual comparison)</dd>
    </div>
  </dl>
</div>
166202 </ div >
167203 </ div >
168204 </ div >
0 commit comments