-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathremote_functions.rb
More file actions
executable file
·127 lines (113 loc) · 3.4 KB
/
remote_functions.rb
File metadata and controls
executable file
·127 lines (113 loc) · 3.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env ruby
# frozen_string_literal: true
# Example: Using remote functions (server-side prompts) in evaluations
#
# This example demonstrates how to:
# 1. Create a remote task function (prompt) on the Braintrust server
# 2. Create a remote scorer function with LLM classifier and choices
# 3. Use both remote task and scorer in Eval.run
#
# Benefits of remote functions:
# - Centralized prompt management
# - Version control for prompts
# - No need to deploy prompt changes with code
# - Consistent prompt execution across environments
# - Remote scorers use choice_scores for deterministic scoring
require "bundler/setup"
require "braintrust"
require "braintrust/eval"
require "braintrust/functions"
# Initialize Braintrust with tracing enabled (default)
# NOTE(review): presumably picks up credentials from the environment
# (e.g. BRAINTRUST_API_KEY) — confirm against the SDK docs.
Braintrust.init
# Every function and experiment created below lives under this project.
project_name = "ruby-sdk-examples"
# First, create the remote functions (task + scorer) on the server.
# In practice you would create these once via the UI or API.
puts "Creating remote functions..."
api = Braintrust::API.new

# Timestamp the slug so repeated runs of this example don't collide.
function_slug = format("food-classifier-%d", Time.now.to_i)

# The task is a server-side chat prompt: a fixed system instruction plus a
# templated user message ({{input}} is filled in from each eval case).
classifier_messages = [
  {
    role: "system",
    content: "You are a food classifier. Classify the input as 'fruit' or 'vegetable'. Return ONLY the classification, nothing else."
  },
  {
    role: "user",
    content: "Classify: {{input}}"
  }
]

api.functions.create(
  project_name: project_name,
  slug: function_slug,
  function_data: {type: "prompt"},
  prompt_data: {
    prompt: {type: "chat", messages: classifier_messages},
    # temperature 0 keeps the classification deterministic.
    options: {model: "gpt-4o-mini", params: {temperature: 0}}
  }
)
puts "Created task function: #{function_slug}"
# Create a remote scorer function (uses LLM classifier with choices).
scorer_slug = format("classification-scorer-%d", Time.now.to_i)

# choice_scores maps each classifier verdict to a fixed numeric score, so
# the LLM's choice is converted to a deterministic result server-side.
scorer_parser = {
  type: "llm_classifier",
  use_cot: true,
  choice_scores: {"correct" => 1.0, "incorrect" => 0.0}
}

# The scorer prompt compares {{expected}} against the task's {{output}}.
scorer_messages = [
  {
    role: "system",
    content: "You are a scorer evaluating food classifications."
  },
  {
    role: "user",
    content: "Expected: {{expected}}\nActual output: {{output}}\n\nDoes the output correctly classify the food? Choose 'correct' if it matches (case-insensitive), otherwise 'incorrect'."
  }
]

api.functions.create(
  project_name: project_name,
  slug: scorer_slug,
  function_data: {type: "prompt"},
  prompt_data: {
    parser: scorer_parser,
    prompt: {type: "chat", messages: scorer_messages},
    options: {model: "gpt-4o-mini", params: {temperature: 0, use_cache: true}}
  }
)
puts "Created scorer function: #{scorer_slug}"
# Now use the remote functions in Eval.run
puts "\nRunning evaluation with remote functions..."

# Resolve a handle to the remote task; execution happens on the server.
task = Braintrust::Functions.task(project: project_name, slug: function_slug)

# Test cases: each pair is (model input, expected classification).
cases = [
  ["apple", "fruit"],
  ["banana", "fruit"],
  ["carrot", "vegetable"],
  ["broccoli", "vegetable"]
].map { |input, expected| {input: input, expected: expected} }

# Run the evaluation.
# Both the task AND scorer execute on the Braintrust server, not locally.
# Scorers can be referenced by slug — they're resolved from the project.
Braintrust::Eval.run(
  project: project_name,
  experiment: "remote-function-demo",
  cases: cases,
  task: task,
  scorers: [scorer_slug]
)

# Flush all spans to ensure they're exported before the process exits.
OpenTelemetry.tracer_provider.shutdown