-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathtrace_scoring.rb
More file actions
94 lines (83 loc) · 3.11 KB
/
trace_scoring.rb
File metadata and controls
94 lines (83 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env ruby
# frozen_string_literal: true
# Example: Trace Scoring
#
# Scorers can declare a `trace:` keyword to inspect the full evaluation trace —
# all spans generated by the task (LLM calls, tool usage, etc.).
#
# This example runs an eval where the task splits a list of items into batches,
# asks the LLM to count the fruit in each batch separately, then sums the results.
# A trace scorer inspects the individual LLM responses to verify the per-batch
# counts add up to the expected total — something you can only check by looking
# at the intermediate LLM calls, not just the final output.
#
# Usage:
# OPENAI_API_KEY=your-key bundle exec appraisal openai ruby examples/eval/trace_scoring.rb
require "bundler/setup"
require "braintrust"
require "openai"
# Fail fast with a readable message: the task below makes real OpenAI API
# calls, so there is no point starting the eval without a key.
unless ENV["OPENAI_API_KEY"]
  puts "Error: OPENAI_API_KEY environment variable is required"
  exit 1
end
# NOTE(review): blocking_login presumably waits for Braintrust auth to
# complete before continuing — confirm against the Braintrust SDK docs.
Braintrust.init(blocking_login: true)
# Shared OpenAI client reused by every per-batch LLM call in the task.
client = OpenAI::Client.new(api_key: ENV["OPENAI_API_KEY"])
# Task: split items into batches, count fruit in each batch via separate LLM calls,
# then return the total.
task = Braintrust::Task.new("count_fruit") do |input:|
  slice_size = input[:batch_size]
  # Sum the per-batch counts directly instead of mutating an accumulator:
  # one LLM call per slice, each expected to answer with a bare number.
  grand_total = input[:items].each_slice(slice_size).sum do |slice|
    reply = client.chat.completions.create(
      model: "gpt-4o-mini",
      messages: [
        {role: "system", content: "You count fruit. Respond with only a number."},
        {role: "user", content: "How many of the following items are fruit? #{slice.join(", ")}"}
      ],
      max_tokens: 5
    )
    # to_i tolerates stray whitespace/non-numeric tails in the model reply.
    reply.choices.first.message.content.strip.to_i
  end
  grand_total.to_s
end
# Trace scorer: check whether the per-batch LLM counts sum to the expected total.
# Returns a proportional score: 1.0 when exact, penalized for each over- or under-count.
# This checks something you can't see from the final output alone — whether each
# individual batch call returned the right count.
batch_sums_match = Braintrust::Scorer.new("batch_sums_match") do |expected:, trace:|
  # Every LLM span the task produced — one per batch call.
  llm_spans = trace.spans(span_type: "llm")
  per_batch_sum = llm_spans.sum do |span|
    out = span["output"] || span[:output]
    # BTQL returns output as a flat array of choices; other formats wrap in {choices: [...]}
    choices = out.is_a?(Array) ? out : (out && (out["choices"] || out[:choices]))
    choice = choices&.first
    msg = choice && (choice["message"] || choice[:message])
    text = msg && (msg["content"] || msg[:content])
    # nil.to_i == 0, so spans with no parseable content contribute nothing.
    text.to_i
  end
  expected_count = expected.to_i
  error = (per_batch_sum - expected_count).abs
  # Guard expected_count == 0: the proportional formula below divides by it,
  # producing NaN (0/0) on an exact match — and Array#max raises when
  # comparing NaN. Score 1.0 for an exact zero match, 0.0 for any miss.
  if expected_count.zero?
    error.zero? ? 1.0 : 0.0
  else
    # Proportional score: 1.0 when exact, penalized by error/expected,
    # floored at 0.0 so large misses never go negative.
    [1.0 - error.to_f / expected_count, 0.0].max
  end
end
# Three cases with known fruit totals; batch_size forces multiple LLM calls
# so the trace scorer has intermediate spans to inspect.
eval_cases = [
  {input: {items: ["apple", "carrot", "banana", "broccoli"], batch_size: 2}, expected: "2"},
  {input: {items: ["grape", "spinach", "mango", "potato", "strawberry", "cucumber"], batch_size: 3}, expected: "3"},
  {input: {items: ["orange", "tomato"], batch_size: 1}, expected: "1"}
]
Braintrust::Eval.run(
  project: "ruby-sdk-examples",
  experiment: "trace-scoring-example",
  cases: eval_cases,
  task: task,
  scorers: [batch_sums_match]
)
# Flush any buffered spans before the process exits.
OpenTelemetry.tracer_provider.shutdown