-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathtrace_scoring.rb
More file actions
94 lines (83 loc) · 3.11 KB
/
trace_scoring.rb
File metadata and controls
94 lines (83 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env ruby
# frozen_string_literal: true
# Example: Trace Scoring
#
# Scorers can declare a `trace:` keyword to inspect the full evaluation trace —
# all spans generated by the task (LLM calls, tool usage, etc.).
#
# This example runs an eval where the task splits a list of items into batches,
# asks the LLM to count the fruit in each batch separately, then sums the results.
# A trace scorer inspects the individual LLM responses to verify the per-batch
# counts add up to the expected total — something you can only check by looking
# at the intermediate LLM calls, not just the final output.
#
# Usage:
# OPENAI_API_KEY=your-key bundle exec appraisal openai ruby examples/eval/trace_scoring.rb
require "bundler/setup"
require "braintrust"
require "openai"
# Fail fast with a readable message: the task below makes real OpenAI API
# calls, so there is no point starting the eval without a key.
unless ENV["OPENAI_API_KEY"]
  puts "Error: OPENAI_API_KEY environment variable is required"
  exit 1
end
# NOTE(review): blocking_login presumably waits for Braintrust auth to
# complete before continuing — confirm against the Braintrust SDK docs.
Braintrust.init(blocking_login: true)
# Shared OpenAI client reused by every per-batch LLM call in the task.
client = OpenAI::Client.new(api_key: ENV["OPENAI_API_KEY"])
# Task: split items into batches, count fruit in each batch via separate LLM calls,
# then return the total.
task = Braintrust::Task.new("count_fruit") do |input:|
  slice_size = input[:batch_size]
  # Sum the per-batch counts directly instead of mutating an accumulator:
  # one LLM call per slice, each expected to answer with a bare number.
  grand_total = input[:items].each_slice(slice_size).sum do |slice|
    reply = client.chat.completions.create(
      model: "gpt-4o-mini",
      messages: [
        {role: "system", content: "You count fruit. Respond with only a number."},
        {role: "user", content: "How many of the following items are fruit? #{slice.join(", ")}"}
      ],
      max_tokens: 5
    )
    # to_i tolerates stray whitespace/non-numeric tails in the model reply.
    reply.choices.first.message.content.strip.to_i
  end
  grand_total.to_s
end
# Trace scorer: check whether the per-batch LLM counts sum to the expected total.
# Returns a proportional score: 1.0 when exact, penalized for each over- or under-count.
# This checks something you can't see from the final output alone — whether each
# individual batch call returned the right count.
batch_sums_match = Braintrust::Scorer.new("batch_sums_match") do |expected:, trace:|
  # Every LLM span the task produced — one per batch call.
  llm_spans = trace.spans(span_type: "llm")
  per_batch_sum = llm_spans.sum do |span|
    out = span["output"] || span[:output]
    # BTQL returns output as a flat array of choices; other formats wrap in {choices: [...]}
    choices = out.is_a?(Array) ? out : (out && (out["choices"] || out[:choices]))
    choice = choices&.first
    msg = choice && (choice["message"] || choice[:message])
    text = msg && (msg["content"] || msg[:content])
    # nil.to_i == 0, so spans with no parseable content contribute nothing.
    text.to_i
  end
  expected_count = expected.to_i
  error = (per_batch_sum - expected_count).abs
  # Guard expected_count == 0: the proportional formula below divides by it,
  # producing NaN (0/0) on an exact match — and Array#max raises when
  # comparing NaN. Score 1.0 for an exact zero match, 0.0 for any miss.
  if expected_count.zero?
    error.zero? ? 1.0 : 0.0
  else
    # Proportional score: 1.0 when exact, penalized by error/expected,
    # floored at 0.0 so large misses never go negative.
    [1.0 - error.to_f / expected_count, 0.0].max
  end
end
# Three cases with known fruit totals; batch_size forces multiple LLM calls
# so the trace scorer has intermediate spans to inspect.
eval_cases = [
  {input: {items: ["apple", "carrot", "banana", "broccoli"], batch_size: 2}, expected: "2"},
  {input: {items: ["grape", "spinach", "mango", "potato", "strawberry", "cucumber"], batch_size: 3}, expected: "3"},
  {input: {items: ["orange", "tomato"], batch_size: 1}, expected: "1"}
]
Braintrust::Eval.run(
  project: "ruby-sdk-examples",
  experiment: "trace-scoring-example",
  cases: eval_cases,
  task: task,
  scorers: [batch_sums_match]
)
# Flush any buffered spans before the process exits.
OpenTelemetry.tracer_provider.shutdown