Skip to content

Commit 23958c6

Browse files
committed
feat(site): use CI for error bars in the main plot
1 parent dacebcb commit 23958c6

3 files changed

Lines changed: 58 additions & 18 deletions

File tree

site/community.html

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,12 @@
102102
Author</th>
103103
<!-- Stats Section -->
104104
<th scope="col" aria-label="Average final round"
105-
class="px-4 py-3 text-center text-sm font-semibold text-zinc-600 dark:text-zinc-300 border-l-2 border-zinc-300 dark:border-zinc-600">
106-
Round</th>
105+
class="group relative cursor-help px-4 py-3 text-center text-sm font-semibold text-zinc-600 dark:text-zinc-300 border-l-2 border-zinc-300 dark:border-zinc-600">
106+
Round
107+
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
108+
Average final round reached across all runs (± std. dev.).
109+
</span>
110+
</th>
107111
<!-- Tool Calls Section -->
108112
<th scope="col" aria-label="Valid tool calls executable in state"
109113
class="group relative cursor-help px-3 py-3 text-center text-sm font-semibold text-zinc-600 dark:text-zinc-300 hidden sm:table-cell border-l-2 border-zinc-300 dark:border-zinc-600">
@@ -190,7 +194,7 @@
190194
</svg>
191195
</div>
192196
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
193-
Average input tokens per tool call.
197+
Average input tokens per tool call (± std. dev.).
194198
</span>
195199
</th>
196200
<th scope="col" aria-label="Average output tokens"
@@ -214,7 +218,7 @@
214218
</svg>
215219
</div>
216220
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
217-
Average output tokens per tool call (including reasoning tokens).
221+
Average output tokens per tool call (± std. dev., including reasoning tokens).
218222
</span>
219223
</th>
220224
<!-- Performance Section -->
@@ -237,7 +241,7 @@
237241
<span class="text-xs">[s]</span>
238242
</div>
239243
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
240-
Average time per tool call in seconds.
244+
Average time per tool call in seconds (± std. dev.).
241245
</span>
242246
</th>
243247
<th scope="col" aria-label="Average cost per tool call"
@@ -261,7 +265,7 @@
261265
<span class="text-xs">[m$]</span>
262266
</div>
263267
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
264-
Average cost per tool call in milli-dollars.
268+
Average cost per tool call in milli-dollars (± std. dev.).
265269
</span>
266270
</th>
267271
</tr>

site/index.html

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,12 @@
148148
Vendor</th>
149149
<!-- Stats Section -->
150150
<th scope="col" aria-label="Average final round"
151-
class="px-4 py-3 text-center text-sm font-semibold text-zinc-700 dark:text-zinc-300 border-l-2 border-zinc-300 dark:border-zinc-600">
152-
Round</th>
151+
class="group relative cursor-help px-4 py-3 text-center text-sm font-semibold text-zinc-700 dark:text-zinc-300 border-l-2 border-zinc-300 dark:border-zinc-600">
152+
Round
153+
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
154+
Average final round reached across all runs (± std. dev.).
155+
</span>
156+
</th>
153157
<!-- Tool Calls Section -->
154158
<th scope="col" aria-label="Valid tool calls executable in state"
155159
class="group relative cursor-help px-3 py-3 text-center text-sm font-semibold text-zinc-700 dark:text-zinc-300 hidden sm:table-cell border-l-2 border-zinc-300 dark:border-zinc-600">
@@ -236,7 +240,7 @@
236240
</svg>
237241
</div>
238242
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
239-
Average input tokens per tool call.
243+
Average input tokens per tool call (± std. dev.).
240244
</span>
241245
</th>
242246
<th scope="col" aria-label="Average output tokens"
@@ -260,7 +264,7 @@
260264
</svg>
261265
</div>
262266
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
263-
Average output tokens per tool call (including reasoning tokens).
267+
Average output tokens per tool call (± std. dev., including reasoning tokens).
264268
</span>
265269
</th>
266270
<!-- Performance Section -->
@@ -283,7 +287,7 @@
283287
<span class="text-xs">[s]</span>
284288
</div>
285289
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
286-
Average time per tool call in seconds.
290+
Average time per tool call in seconds (± std. dev.).
287291
</span>
288292
</th>
289293
<th scope="col" aria-label="Average cost per tool call"
@@ -307,7 +311,7 @@
307311
<span class="text-xs">[m$]</span>
308312
</div>
309313
<span class="pointer-events-none absolute right-full top-1/2 z-[9999] mr-2 -translate-y-1/2 whitespace-nowrap rounded-md bg-zinc-900 px-3 py-2 text-xs font-normal text-white opacity-0 shadow-md transition-opacity duration-200 group-hover:opacity-100 dark:bg-zinc-100 dark:text-zinc-900">
310-
Average cost per tool call in milli-dollars.
314+
Average cost per tool call in milli-dollars (± std. dev.).
311315
</span>
312316
</th>
313317
</tr>

site/script.js

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,28 @@ function createRoundHistogram(runs, canvasId) {
582582
});
583583
}
584584

585+
/**
 * Two-tailed 95% critical values of Student's t distribution, t(0.025, df),
 * keyed by degrees of freedom (df = n - 1, where n = run_count).
 *
 * The table is dense for df 1..20 and sparse above that; lookups for a df
 * that is not listed fall back to the closest lower key (see tCritical).
 */
const T_CRIT_95 = Object.freeze({
    1: 12.706, 2: 4.303, 3: 3.182, 4: 2.776, 5: 2.571,
    6: 2.447, 7: 2.365, 8: 2.306, 9: 2.262, 10: 2.228,
    11: 2.201, 12: 2.179, 13: 2.160, 14: 2.145, 15: 2.131,
    16: 2.120, 17: 2.110, 18: 2.101, 19: 2.093, 20: 2.086,
    25: 2.060, 30: 2.042, 40: 2.021, 60: 2.000, 120: 1.980
});

// Tabulated df values, largest first. Built once at load time so that
// tCritical does not have to re-extract and re-sort the keys on every call.
const T_CRIT_95_DFS_DESC = Object.keys(T_CRIT_95)
    .map(Number)
    .sort((a, b) => b - a);

/**
 * Look up the two-tailed 95% t critical value for a sample of size n.
 *
 * @param {number} n - Sample size (number of runs); df is taken as n - 1.
 * @returns {number} The critical value for df = n - 1. Returns 0 when no
 *   interval can be computed (n <= 1, or n missing / not a number). For a df
 *   that is not tabulated, returns the value of the closest lower tabulated
 *   df, which is slightly larger and therefore errs on the wide side; df
 *   above the last table entry (120) yields that entry's value (1.980).
 */
function tCritical(n) {
    const df = n - 1;

    // A missing or non-numeric n makes df NaN. Every comparison with NaN is
    // false, so without this guard the lookup would fall through and report
    // a large-sample value for data whose sample size is actually unknown.
    if (Number.isNaN(df) || df <= 0) return 0;

    const exact = T_CRIT_95[df];
    if (exact !== undefined) return exact;

    // Closest tabulated df that does not exceed the requested one.
    const floorDf = T_CRIT_95_DFS_DESC.find((key) => key <= df);
    if (floorDf !== undefined) return T_CRIT_95[floorDf];

    // Only reachable for a fractional df in (0, 1): use the widest tabulated
    // value instead of a narrower large-sample constant.
    return T_CRIT_95[1];
}
606+
585607
// Create performance bar chart with error bars
586608
function createPerformanceBarChart(entries) {
587609
const ctx = document.getElementById('performance-chart').getContext('2d');
@@ -612,8 +634,15 @@ function createPerformanceBarChart(entries) {
612634
fillColors.push(base);
613635
});
614636

637+
// Compute 95% CI half-widths: t(0.025, n-1) × SD / √n
638+
const ciHalfWidths = entries.map((entry, i) => {
639+
const n = entry.run_count;
640+
const t = tCritical(n);
641+
return t * stdDevs[i] / Math.sqrt(n);
642+
});
643+
615644
// Calculate Y-axis max to include error bars
616-
const maxWithError = Math.max(...avgRounds.map((avg, i) => avg + stdDevs[i]));
645+
const maxWithError = Math.max(...avgRounds.map((avg, i) => avg + ciHalfWidths[i]));
617646
// Add 0.5 padding above highest error bar, then round up to next integer for clean axis labels
618647
const yAxisMax = Math.ceil(maxWithError + 0.5);
619648

@@ -634,8 +663,8 @@ function createPerformanceBarChart(entries) {
634663
borderWidth: 0,
635664
errorBars: {
636665
'Average Final Round': {
637-
plus: stdDevs,
638-
minus: stdDevs
666+
plus: ciHalfWidths,
667+
minus: ciHalfWidths
639668
}
640669
}
641670
}]
@@ -652,8 +681,11 @@ function createPerformanceBarChart(entries) {
652681
tooltip: {
653682
...ChartConfig.getTooltipFonts(),
654683
callbacks: {
655-
label: (context) =>
656-
`${context.parsed.y.toFixed(1)} ± ${stdDevs[context.dataIndex].toFixed(1)}`
684+
label: (context) => {
685+
const ci = ciHalfWidths[context.dataIndex];
686+
const avg = context.parsed.y;
687+
return `${avg.toFixed(1)} ± ${ci.toFixed(1)} (95% CI)`;
688+
}
657689
}
658690
}
659691
},
@@ -695,7 +727,7 @@ function createPerformanceBarChart(entries) {
695727
const x = bar.x;
696728
const y = bar.y;
697729
const value = dataset.data[index];
698-
const stdDev = stdDevs[index];
730+
const stdDev = ciHalfWidths[index];
699731
const scale = chart.scales.y;
700732

701733
// Calculate error bar positions

0 commit comments

Comments
 (0)