braintrust-sdk-ruby/examples/eval/dataset.rb at main · braintrustdata/braintrust-sdk-ruby · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env ruby
# frozen_string_literal: true

# Example: Running an evaluation against a dataset
#
# This example demonstrates:
# 1. Creating a dataset with test cases
# 2. Running an evaluation using the dataset
# 3. Different ways to specify datasets (name string, hash with explicit project, ID, record limit)
#
# Usage:
#   ruby examples/eval/dataset.rb

require "bundler/setup"
require "braintrust"

# Initialize the SDK first; the API client created next uses that global state.
Braintrust.init
api = Braintrust::API.new

# Shut down the OpenTelemetry tracer provider when the script exits.
at_exit do
  OpenTelemetry.tracer_provider.shutdown
end

# Project name shared by the dataset and the eval runs
project_name = "ruby-sdk-examples"

# Suffix the dataset name with the current Unix time so that runs started
# at different seconds get distinct names
dataset_name = "string-transform-#{Time.now.to_i}"
puts "Creating dataset '#{dataset_name}'..."

dataset_attributes = {
  name: dataset_name,
  project_name: project_name,
  description: "Example dataset for string transformation evaluation"
}
created = api.datasets.create(**dataset_attributes)

# The ID is read from the "dataset" entry of the create response
dataset_id = created["dataset"]["id"]

# Test cases for the dataset, built from [input, expected] pairs
test_cases = [
  ["hello", "HELLO"],
  ["world", "WORLD"],
  ["ruby", "RUBY"],
  ["braintrust", "BRAINTRUST"]
].map { |input, expected| {input: input, expected: expected} }

# Upload every test case as an event of the dataset created above
api.datasets.insert(
  id: dataset_id,
  events: test_cases
)

# Task under evaluation: upcase the given input (required keyword `input:`)
task = lambda { |input:| input.upcase }

# Scorer named "exact_match": 1.0 when the output equals the expected
# value, 0.0 otherwise
scorer = Braintrust::Scorer.new("exact_match") do |expected:, output:|
  if output == expected
    1.0
  else
    0.0
  end
end

# Example 1: pass the dataset as a plain name string (uses same project)
banner_rule = "=" * 60
puts "", banner_rule, "Example 1: Dataset as string (same project)", banner_rule

# A simple string for `dataset:` fetches the dataset from the same project
Braintrust::Eval.run(
  project: project_name,
  experiment: "dataset-eval-string",
  dataset: dataset_name,
  task: task,
  scorers: [scorer]
)

# Example 2: pass the dataset as a hash that names its project explicitly
banner_rule = "=" * 60
puts "", banner_rule, "Example 2: Dataset as hash with explicit project", banner_rule

# Dataset name plus the explicit project to look it up in
explicit_project_dataset = {name: dataset_name, project: project_name}

Braintrust::Eval.run(
  project: project_name,
  experiment: "dataset-eval-hash",
  dataset: explicit_project_dataset,
  task: task,
  scorers: [scorer]
)

# Example 3: reference the dataset by its ID instead of its name
banner_rule = "=" * 60
puts "", banner_rule, "Example 3: Dataset by ID", banner_rule

# Fetch by ID, using the ID returned when the dataset was created
dataset_by_id = {id: dataset_id}

Braintrust::Eval.run(
  project: project_name,
  experiment: "dataset-eval-id",
  dataset: dataset_by_id,
  task: task,
  scorers: [scorer]
)

# Example 4: cap how many dataset records the eval uses
banner_rule = "=" * 60
puts "", banner_rule, "Example 4: Dataset with record limit", banner_rule

# `limit: 2` means only the first 2 records are used
limited_dataset = {
  name: dataset_name,
  project: project_name,
  limit: 2
}

Braintrust::Eval.run(
  project: project_name,
  experiment: "dataset-eval-limit",
  dataset: limited_dataset,
  task: task,
  scorers: [scorer]
)