-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy path multi_score.rb
More file actions
132 lines (118 loc) · 4.05 KB
/
multi_score.rb
File metadata and controls
132 lines (118 loc) · 4.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env ruby
# frozen_string_literal: true
require "bundler/setup"
require "braintrust"
require "opentelemetry/sdk"
# Example: Multi-Score Scorers
#
# A scorer can return an Array of score hashes to emit multiple named metrics
# from a single scorer call. Each hash must have a :name and :score key; an
# optional :metadata key attaches structured context to that metric.
#
# This is useful when several dimensions of quality (e.g. correctness,
# completeness, format) can be computed together — sharing one inference call
# or one pass over the output — rather than running separate scorers.
#
# Two patterns are shown:
#
# 1. Block-based (Braintrust::Scorer.new):
# Pass a block that returns an Array. Good for concise, one-off scorers.
#
# 2. Class-based (include Braintrust::Scorer):
# Define a class with a #call method. Good for reusable scorers that
# share helper logic across multiple metrics.
#
# Usage:
# bundle exec ruby examples/eval/multi_score.rb
Braintrust.init
# ---------------------------------------------------------------------------
# Task: summarise a list of facts
# ---------------------------------------------------------------------------
FACTS = {
"The sky is blue and clouds are white." => {
key_terms: %w[sky blue clouds white],
max_words: 10
},
"Ruby was created by Matz in 1995." => {
key_terms: %w[ruby matz 1995],
max_words: 8
},
"The Pacific Ocean is the largest ocean on Earth." => {
key_terms: %w[pacific largest ocean earth],
max_words: 10
}
}
# Simulated summariser (replace with a real LLM call in production)
def summarise(text)
# Naive: drop words over the limit and lowercase
text.split.first(8).join(" ").downcase
end
# ---------------------------------------------------------------------------
# Pattern 1: block-based multi-score scorer
#
# Returns three metrics in one pass:
# - coverage: fraction of key terms present in the summary
# - conciseness: 1.0 if under the word limit, else 0.0
# - lowercase: 1.0 if the summary is fully lowercased
# ---------------------------------------------------------------------------
summary_quality = Braintrust::Scorer.new("summary_quality") do |output:, expected:|
words = output.to_s.downcase.split
key_terms = expected[:key_terms]
max_words = expected[:max_words]
covered = key_terms.count { |t| words.include?(t) }
coverage_score = key_terms.empty? ? 1.0 : covered.to_f / key_terms.size
[
{
name: "coverage",
score: coverage_score,
metadata: {covered: covered, total: key_terms.size, missing: key_terms - words}
},
{
name: "conciseness",
score: (words.size <= max_words) ? 1.0 : 0.0,
metadata: {word_count: words.size, limit: max_words}
},
{
name: "lowercase",
score: (output.to_s == output.to_s.downcase) ? 1.0 : 0.0
}
]
end
# ---------------------------------------------------------------------------
# Pattern 2: class-based multi-score scorer
#
# Include Braintrust::Scorer and define #call. The class name is used as the
# scorer name by default; override #name to customise it.
#
# Returns two metrics:
# - ends_with_period: checks punctuation
# - no_first_person: checks for avoided first-person pronouns
# ---------------------------------------------------------------------------
class StyleChecker
include Braintrust::Scorer
FIRST_PERSON = %w[i me my myself we us our].freeze
def call(output:, **)
text = output.to_s
words = text.downcase.split(/\W+/)
fp_words = words & FIRST_PERSON
[
{
name: "ends_with_period",
score: text.strip.end_with?(".") ? 1.0 : 0.0
},
{
name: "no_first_person",
score: fp_words.empty? ? 1.0 : 0.0,
metadata: {found: fp_words}
}
]
end
end
Braintrust::Eval.run(
project: "ruby-sdk-examples",
experiment: "multi-score-example",
cases: FACTS.map { |text, expected| {input: text, expected: expected} },
task: ->(input:) { summarise(input) },
scorers: [summary_quality, StyleChecker.new]
)
OpenTelemetry.tracer_provider.shutdown