-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathscorer_metadata.rb
More file actions
76 lines (67 loc) · 2.21 KB
/
scorer_metadata.rb
File metadata and controls
76 lines (67 loc) · 2.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env ruby
# frozen_string_literal: true
require "bundler/setup"
require "braintrust"
require "opentelemetry/sdk"
# Example: Scorer Metadata
#
# Scorers can return a Hash with :score and :metadata keys to attach
# structured context alongside the numeric score. The metadata is
# logged on the scorer's span and visible in the Braintrust UI for
# debugging and filtering.
#
# Usage:
# bundle exec ruby examples/eval/scorer_metadata.rb
# Initialize the Braintrust SDK before any other Braintrust call in this script.
# NOTE(review): presumably reads its API key/configuration from the environment
# and wires up tracing — confirm against the SDK documentation.
Braintrust.init
# Ground-truth tool call for each example prompt.
#
# Keys are the user prompts fed to the task; each value describes the tool the
# model should pick (:name) and the argument names it should supply (:args).
#
# The table is frozen so it cannot be accidentally extended or reassigned-into
# at runtime. The freeze is shallow: the nested hashes and arrays are left
# untouched so they can be handed to the eval as-is.
EXPECTED_TOOLS = {
  "What's the weather?" => {name: "get_weather", args: ["location"]},
  "Book a flight to Paris" => {name: "book_flight", args: ["destination", "date"]},
  "Send an email to Bob" => {name: "send_email", args: ["recipient", "subject", "body"]}
}.freeze
# Simulated tool-calling model.
#
# Matches the prompt against a few case-insensitive keywords (first match
# wins) and returns the tool call the "model" would make as a Hash with
# :name and :args. Two of the answers are deliberately imperfect so the
# scorer has something to report.
#
# @param input [Object] the prompt; anything that matches no keyword
#   (including nil) falls through to the "unknown" tool
# @return [Hash] a fresh {name:, args:} hash on every call
def pick_tool(input)
  tool_name, tool_args =
    case input
    when /weather/i
      ["get_weather", ["location"]]
    when /flight/i
      # Intentionally incomplete: the expected call also includes "date".
      ["book_flight", ["destination"]]
    when /email/i
      # Intentionally the wrong tool.
      ["wrong_tool", []]
    else
      ["unknown", []]
    end

  {name: tool_name, args: tool_args}
end
# Scorer that explains *why* a score was given by returning structured metadata.
#
# The block receives the expected tool call and the task's output (both read
# via [:name] and [:args]) and returns a Hash of the form
# {score: Float, metadata: Hash}:
#   * 0.0 - the tool name differs from the expected one
#   * 0.5 - right tool, but some expected argument names are absent
#   * 1.0 - right tool and no expected argument is absent
# Extra arguments in the output are not penalized; only missing ones are.
tool_accuracy = Braintrust::Scorer.new("tool_accuracy") do |expected:, output:|
  wanted_tool = expected[:name]
  chosen_tool = output[:name]
  wanted_args = expected[:args]
  chosen_args = output[:args]

  # Only compare arguments once the tool itself is right; absent_args stays
  # nil on a tool mismatch, so the array difference is never computed there.
  absent_args = (wanted_args - chosen_args) if chosen_tool == wanted_tool

  score, metadata =
    case absent_args
    when nil
      [0.0, {
        failure_type: "wrong_tool",
        reason: "Expected tool '#{wanted_tool}' but got '#{chosen_tool}'"
      }]
    when [] then [1.0, {failure_type: nil, reason: "Correct tool and arguments"}]
    else
      [0.5, {
        failure_type: "missing_arguments",
        reason: "Correct tool '#{wanted_tool}' but missing args: #{absent_args.join(", ")}",
        missing_args: absent_args
      }]
    end

  {score: score, metadata: metadata}
end
# One eval case per prompt: the prompt is the task input and the table entry
# is the expected tool call handed to the scorer.
eval_cases = EXPECTED_TOOLS.map do |prompt, expected_call|
  {input: prompt, expected: expected_call}
end

# The task under evaluation: run the simulated model on the case input.
simulated_model = ->(input:) { pick_tool(input) }

Braintrust::Eval.run(
  project: "ruby-sdk-examples",
  experiment: "scorer-metadata-example",
  cases: eval_cases,
  task: simulated_model,
  scorers: [tool_accuracy]
)

# Shut the tracer provider down as the last step of the script.
# NOTE(review): presumably this flushes any spans still buffered by the
# OpenTelemetry SDK before the process exits — confirm against the SDK docs.
OpenTelemetry.tracer_provider.shutdown