Skip to content

Commit 1d1f642

Browse files
authored
fix(eval): Add agent tool trajectory eval (#854)
1 parent a6bd4be commit 1d1f642

File tree

5 files changed

+221
-24
lines changed

5 files changed

+221
-24
lines changed

packages/sample-app/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"run:sample_experiment": "npm run build && node dist/src/sample_experiment.js",
4242
"run:github_experiment": "npm run build && node dist/src/sample_github_experiment.js",
4343
"run:security_experiment": "npm run build && node dist/src/sample_security_experiment.js",
44+
"run:agent_tool_trajectory": "npm run build && node dist/src/sample_agent_tool_trajectory.js",
4445
"run:mcp": "npm run build && node dist/src/sample_mcp.js",
4546
"run:mcp:real": "npm run build && node dist/src/sample_mcp_real.js",
4647
"run:mcp:working": "npm run build && node dist/src/sample_mcp_working.js",
packages/sample-app/src/sample_agent_tool_trajectory.ts

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/**
2+
* Agent Tool Trajectory Experiment
3+
*
4+
* This example demonstrates Traceloop's agent tool trajectory evaluator:
5+
* - Agent Tool Trajectory: Validates the agent tool trajectory
6+
*
7+
* This evaluator helps ensure your AI agents perform optimally and follow the expected tool trajectory.
8+
*/
9+
10+
import * as traceloop from "@traceloop/node-server-sdk";
11+
import type {
12+
ExperimentTaskFunction,
13+
TaskInput,
14+
TaskOutput,
15+
} from "@traceloop/node-server-sdk";
16+
17+
import "dotenv/config";
18+
19+
const main = async () => {
20+
console.log("Agent Tool Trajectory Experiment\n");
21+
22+
traceloop.initialize({
23+
appName: "agent_tool_trajectory_experiment",
24+
apiKey: process.env.TRACELOOP_API_KEY,
25+
disableBatch: true,
26+
traceloopSyncEnabled: true,
27+
});
28+
29+
try {
30+
await traceloop.waitForInitialization();
31+
} catch (error) {
32+
console.error(
33+
"Failed to initialize Traceloop SDK:",
34+
error instanceof Error ? error.message : String(error),
35+
);
36+
process.exit(1);
37+
}
38+
39+
const client = traceloop.getClient();
40+
if (!client) {
41+
console.error("Failed to initialize Traceloop client");
42+
return;
43+
}
44+
45+
/**
46+
* Task function for agent tool trajectory evaluation
47+
*/
48+
const agentEvaluatorsTask: ExperimentTaskFunction = async (
49+
row: TaskInput,
50+
): Promise<TaskOutput> => {
51+
const executedToolCalls = (row.actual as string) || "";
52+
const defaultExpected =
53+
"[{'name': 'search', 'input': {'query': 'weather'}}, " +
54+
"{'name': 'book_flight', 'input': {'flight': 'NYC to Paris'}}, " +
55+
"{'name': 'get_confirmation', 'input': {'confirmation': 'flight booked'}}]";
56+
const expectedToolCalls = (row.expected as string) || defaultExpected;
57+
58+
return {
59+
executed_tool_calls: executedToolCalls,
60+
expected_tool_calls: expectedToolCalls,
61+
};
62+
};
63+
64+
console.log("\n" + "=".repeat(80));
65+
console.log("AGENT TOOL TRAJECTORY EXPERIMENT");
66+
console.log("=".repeat(80) + "\n");
67+
console.log(
68+
"This experiment will test the agent tool trajectory with the agent tool trajectory evaluator:\n",
69+
);
70+
console.log("1. Agent Tool Trajectory - Validates the agent tool trajectory");
71+
console.log("\n" + "-".repeat(80) + "\n");
72+
73+
// Configure agent evaluators
74+
// Using the evaluator slug directly - TypeScript should infer it's valid
75+
const evaluators = [
76+
{
77+
name: "agent-tool-trajectory",
78+
config: {
79+
input_params_sensitive: true,
80+
mismatch_sensitive: false,
81+
order_sensitive: false,
82+
threshold: 0.7,
83+
},
84+
},
85+
];
86+
87+
console.log("Running experiment with evaluators:");
88+
evaluators.forEach((evaluator) => {
89+
console.log(` - ${evaluator.name}`);
90+
});
91+
92+
console.log("\n" + "-".repeat(80) + "\n");
93+
94+
try {
95+
// Run the experiment
96+
// Note: You'll need to create a dataset with appropriate test cases for agents
97+
const result = await client.experiment.run(agentEvaluatorsTask, {
98+
datasetSlug: "agent-tool-trajectory", // Set a dataset slug that exists in the traceloop platform
99+
datasetVersion: "v1",
100+
evaluators,
101+
experimentSlug: "agent-tool-trajectory-exp",
102+
stopOnError: false,
103+
waitForResults: true,
104+
});
105+
106+
console.log("\n" + "=".repeat(80));
107+
console.log("Agent tool trajectory experiment completed!");
108+
console.log("=".repeat(80) + "\n");
109+
110+
if ("taskResults" in result) {
111+
console.log("Results summary:");
112+
console.log(` - Total rows processed: ${result.taskResults.length}`);
113+
console.log(` - Errors encountered: ${result.errors.length}`);
114+
console.log(` - Experiment ID: ${result.experimentId}`);
115+
116+
if (result.errors.length > 0) {
117+
console.log("\nErrors:");
118+
result.errors.forEach((error) => {
119+
console.log(` - ${error}`);
120+
});
121+
}
122+
}
123+
} catch (error) {
124+
console.error(
125+
"❌ Error in experiment operations:",
126+
error instanceof Error ? error.message : String(error),
127+
);
128+
if (error instanceof Error && error.stack) {
129+
console.error("Stack trace:", error.stack);
130+
}
131+
}
132+
};
133+
134+
// Error handling for the main function
135+
main().catch((error) => {
136+
console.error("💥 Application failed:", error.message);
137+
process.exit(1);
138+
});

packages/traceloop-sdk/src/lib/generated/evaluators/mbt-evaluators.ts

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ import { EVALUATOR_SLUGS, EVALUATOR_SCHEMAS, isValidEvaluatorSlug, type Evaluato
88
// Config type aliases from generated OpenAPI types
99
export type AgentFlowQualityConfig = components['schemas']['request.AgentFlowQualityRequest']['config'];
1010
export type AgentGoalCompletenessConfig = components['schemas']['request.AgentGoalCompletenessRequest']['config'];
11+
export type AgentToolTrajectoryConfig = components['schemas']['request.AgentToolTrajectoryRequest']['config'];
1112
export type ContextRelevanceConfig = components['schemas']['request.ContextRelevanceRequest']['config'];
12-
export type ConversationQualityConfig = components['schemas']['request.ConversationQualityRequest']['config'];
13-
export type IntentChangeConfig = components['schemas']['request.IntentChangeRequest']['config'];
1413
export type JsonValidatorConfig = components['schemas']['request.JSONValidatorRequest']['config'];
1514
export type PiiDetectorConfig = components['schemas']['request.PIIDetectorRequest']['config'];
1615
export type PlaceholderRegexConfig = components['schemas']['request.PlaceholderRegexRequest']['config'];
@@ -167,6 +166,22 @@ export class EvaluatorMadeByTraceloop {
167166
return createEvaluator('agent-tool-error-detector');
168167
}
169168

169+
/**
170+
* Compare actual tool calls against expected reference tool calls
171+
172+
**Request Body:**
173+
- `input.executed_tool_calls` (string, required): JSON array of actual tool calls made by the agent
174+
- `input.expected_tool_calls` (string, required): JSON array of expected/reference tool calls
175+
- `config.threshold` (float, optional): Score threshold for pass/fail determination (default: 0.5)
176+
- `config.mismatch_sensitive` (bool, optional): Whether tool calls must match exactly (default: false)
177+
- `config.order_sensitive` (bool, optional): Whether order of tool calls matters (default: false)
178+
- `config.input_params_sensitive` (bool, optional): Whether to compare input parameters (default: true)
179+
* Required task output fields: executed_tool_calls, expected_tool_calls
180+
*/
181+
static agentToolTrajectory(config?: AgentToolTrajectoryConfig): EvaluatorWithConfig {
182+
return createEvaluator('agent-tool-trajectory', { config: config as Record<string, unknown> });
183+
}
184+
170185
/**
171186
* Evaluate whether the answer is complete and contains all the necessary information
172187
@@ -247,11 +262,10 @@ export class EvaluatorMadeByTraceloop {
247262
**Request Body:**
248263
- `input.prompts` (string, required): JSON array of prompts in the conversation
249264
- `input.completions` (string, required): JSON array of completions in the conversation
250-
- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)
251265
* Required task output fields: completions, prompts
252266
*/
253-
static conversationQuality(config?: ConversationQualityConfig): EvaluatorWithConfig {
254-
return createEvaluator('conversation-quality', { config: config as Record<string, unknown> });
267+
static conversationQuality(): EvaluatorWithConfig {
268+
return createEvaluator('conversation-quality');
255269
}
256270

257271
/**
@@ -267,6 +281,18 @@ export class EvaluatorMadeByTraceloop {
267281
return createEvaluator('faithfulness');
268282
}
269283

284+
/**
285+
* Compare two HTML documents for structural and content similarity
286+
287+
**Request Body:**
288+
- `input.html1` (string, required): The first HTML document to compare
289+
- `input.html2` (string, required): The second HTML document to compare
290+
* Required task output fields: html1, html2
291+
*/
292+
static htmlComparison(): EvaluatorWithConfig {
293+
return createEvaluator('html-comparison');
294+
}
295+
270296
/**
271297
* Evaluate how well responses follow given instructions
272298
@@ -285,11 +311,10 @@ export class EvaluatorMadeByTraceloop {
285311
**Request Body:**
286312
- `input.prompts` (string, required): JSON array of prompts in the conversation
287313
- `input.completions` (string, required): JSON array of completions in the conversation
288-
- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)
289314
* Required task output fields: completions, prompts
290315
*/
291-
static intentChange(config?: IntentChangeConfig): EvaluatorWithConfig {
292-
return createEvaluator('intent-change', { config: config as Record<string, unknown> });
316+
static intentChange(): EvaluatorWithConfig {
317+
return createEvaluator('intent-change');
293318
}
294319

295320
/**

packages/traceloop-sdk/src/lib/generated/evaluators/registry.ts

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@ export interface EvaluatorSchema {
88
description?: string;
99
}
1010

11-
export type EvaluatorSlug = 'agent-efficiency' | 'agent-flow-quality' | 'agent-goal-accuracy' | 'agent-goal-completeness' | 'agent-tool-error-detector' | 'answer-completeness' | 'answer-correctness' | 'answer-relevancy' | 'char-count' | 'char-count-ratio' | 'context-relevance' | 'conversation-quality' | 'faithfulness' | 'instruction-adherence' | 'intent-change' | 'json-validator' | 'perplexity' | 'pii-detector' | 'placeholder-regex' | 'profanity-detector' | 'prompt-injection' | 'prompt-perplexity' | 'regex-validator' | 'secrets-detector' | 'semantic-similarity' | 'sexism-detector' | 'sql-validator' | 'tone-detection' | 'topic-adherence' | 'toxicity-detector' | 'uncertainty-detector' | 'word-count' | 'word-count-ratio';
11+
export type EvaluatorSlug = 'agent-efficiency' | 'agent-flow-quality' | 'agent-goal-accuracy' | 'agent-goal-completeness' | 'agent-tool-error-detector' | 'agent-tool-trajectory' | 'answer-completeness' | 'answer-correctness' | 'answer-relevancy' | 'char-count' | 'char-count-ratio' | 'context-relevance' | 'conversation-quality' | 'faithfulness' | 'html-comparison' | 'instruction-adherence' | 'intent-change' | 'json-validator' | 'perplexity' | 'pii-detector' | 'placeholder-regex' | 'profanity-detector' | 'prompt-injection' | 'prompt-perplexity' | 'regex-validator' | 'secrets-detector' | 'semantic-similarity' | 'sexism-detector' | 'sql-validator' | 'tone-detection' | 'topic-adherence' | 'toxicity-detector' | 'uncertainty-detector' | 'word-count' | 'word-count-ratio';
1212

1313
export const EVALUATOR_SLUGS: EvaluatorSlug[] = [
1414
'agent-efficiency',
1515
'agent-flow-quality',
1616
'agent-goal-accuracy',
1717
'agent-goal-completeness',
1818
'agent-tool-error-detector',
19+
'agent-tool-trajectory',
1920
'answer-completeness',
2021
'answer-correctness',
2122
'answer-relevancy',
@@ -24,6 +25,7 @@ export const EVALUATOR_SLUGS: EvaluatorSlug[] = [
2425
'context-relevance',
2526
'conversation-quality',
2627
'faithfulness',
28+
'html-comparison',
2729
'instruction-adherence',
2830
'intent-change',
2931
'json-validator',
@@ -77,6 +79,12 @@ export const EVALUATOR_SCHEMAS: Record<EvaluatorSlug, EvaluatorSchema> = {
7779
optionalConfigFields: [],
7880
description: "Detect errors or failures during tool execution\n\n**Request Body:**\n- `input.tool_input` (string, required): JSON string of the tool input\n- `input.tool_output` (string, required): JSON string of the tool output",
7981
},
82+
'agent-tool-trajectory': {
83+
slug: 'agent-tool-trajectory',
84+
requiredInputFields: ['executed_tool_calls', 'expected_tool_calls'],
85+
optionalConfigFields: ['input_params_sensitive', 'mismatch_sensitive', 'order_sensitive', 'threshold'],
86+
description: "Compare actual tool calls against expected reference tool calls\n\n**Request Body:**\n- `input.executed_tool_calls` (string, required): JSON array of actual tool calls made by the agent\n- `input.expected_tool_calls` (string, required): JSON array of expected/reference tool calls\n- `config.threshold` (float, optional): Score threshold for pass/fail determination (default: 0.5)\n- `config.mismatch_sensitive` (bool, optional): Whether tool calls must match exactly (default: false)\n- `config.order_sensitive` (bool, optional): Whether order of tool calls matters (default: false)\n- `config.input_params_sensitive` (bool, optional): Whether to compare input parameters (default: true)",
87+
},
8088
'answer-completeness': {
8189
slug: 'answer-completeness',
8290
requiredInputFields: ['completion', 'context', 'question'],
@@ -116,15 +124,21 @@ export const EVALUATOR_SCHEMAS: Record<EvaluatorSlug, EvaluatorSchema> = {
116124
'conversation-quality': {
117125
slug: 'conversation-quality',
118126
requiredInputFields: ['completions', 'prompts'],
119-
optionalConfigFields: ['model'],
120-
description: "Evaluate conversation quality based on tone, clarity, flow, responsiveness, and transparency\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation\n- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)",
127+
optionalConfigFields: [],
128+
description: "Evaluate conversation quality based on tone, clarity, flow, responsiveness, and transparency\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation",
121129
},
122130
'faithfulness': {
123131
slug: 'faithfulness',
124132
requiredInputFields: ['completion', 'context', 'question'],
125133
optionalConfigFields: [],
126134
description: "Check if a completion is faithful to the provided context\n\n**Request Body:**\n- `input.completion` (string, required): The LLM completion to check for faithfulness\n- `input.context` (string, required): The context that the completion should be faithful to\n- `input.question` (string, required): The original question asked",
127135
},
136+
'html-comparison': {
137+
slug: 'html-comparison',
138+
requiredInputFields: ['html1', 'html2'],
139+
optionalConfigFields: [],
140+
description: "Compare two HTML documents for structural and content similarity\n\n**Request Body:**\n- `input.html1` (string, required): The first HTML document to compare\n- `input.html2` (string, required): The second HTML document to compare",
141+
},
128142
'instruction-adherence': {
129143
slug: 'instruction-adherence',
130144
requiredInputFields: ['instructions', 'response'],
@@ -134,8 +148,8 @@ export const EVALUATOR_SCHEMAS: Record<EvaluatorSlug, EvaluatorSchema> = {
134148
'intent-change': {
135149
slug: 'intent-change',
136150
requiredInputFields: ['completions', 'prompts'],
137-
optionalConfigFields: ['model'],
138-
description: "Detect changes in user intent between prompts and completions\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation\n- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)",
151+
optionalConfigFields: [],
152+
description: "Detect changes in user intent between prompts and completions\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation",
139153
},
140154
'json-validator': {
141155
slug: 'json-validator',

packages/traceloop-sdk/src/lib/generated/evaluators/types.ts

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ export interface components {
1717
trajectory_prompts: string;
1818
};
1919
"request.AgentFlowQualityRequest": {
20-
config?: components["schemas"]["request.AgentFlowQualityConfigRequest"];
20+
config: components["schemas"]["request.AgentFlowQualityConfigRequest"];
2121
input: components["schemas"]["request.AgentFlowQualityInput"];
2222
};
2323
"request.AgentFlowQualityConfigRequest": {
@@ -71,6 +71,26 @@ export interface components {
7171
/** @example {"status": "success", "results": [{"flight": "AF123", "price": 450}]} */
7272
tool_output: string;
7373
};
74+
"request.AgentToolTrajectoryRequest": {
75+
config?: components["schemas"]["request.AgentToolTrajectoryConfigRequest"];
76+
input: components["schemas"]["request.AgentToolTrajectoryInput"];
77+
};
78+
"request.AgentToolTrajectoryConfigRequest": {
79+
/** @example true */
80+
input_params_sensitive?: boolean;
81+
/** @example false */
82+
mismatch_sensitive?: boolean;
83+
/** @example false */
84+
order_sensitive?: boolean;
85+
/** @example 0.5 */
86+
threshold?: number;
87+
};
88+
"request.AgentToolTrajectoryInput": {
89+
/** @example [{"name": "search", "input": {"query": "weather"}}] */
90+
executed_tool_calls: string;
91+
/** @example [{"name": "search", "input": {"query": "weather"}}] */
92+
expected_tool_calls: string;
93+
};
7494
"request.AnswerCompletenessRequest": {
7595
input: components["schemas"]["request.AnswerCompletenessInput"];
7696
};
@@ -133,13 +153,8 @@ export interface components {
133153
query: string;
134154
};
135155
"request.ConversationQualityRequest": {
136-
config?: components["schemas"]["request.ConversationQualityConfigRequest"];
137156
input: components["schemas"]["request.ConversationQualityInput"];
138157
};
139-
"request.ConversationQualityConfigRequest": {
140-
/** @example gpt-4o */
141-
model?: string;
142-
};
143158
"request.ConversationQualityInput": {
144159
/** @example ["Hi! I'd be happy to assist you today.", "We offer consulting, development, and support services."] */
145160
completions: string;
@@ -157,6 +172,15 @@ export interface components {
157172
/** @example When was the Eiffel Tower built? */
158173
question: string;
159174
};
175+
"request.HtmlComparisonRequest": {
176+
input: components["schemas"]["request.HtmlComparisonInput"];
177+
};
178+
"request.HtmlComparisonInput": {
179+
/** @example <html><body><h1>Hello, world!</h1></body></html> */
180+
html1: string;
181+
/** @example <html><body><h1>Hello, world!</h1></body></html> */
182+
html2: string;
183+
};
160184
"request.InstructionAdherenceRequest": {
161185
input: components["schemas"]["request.InstructionAdherenceInput"];
162186
};
@@ -171,13 +195,8 @@ export interface components {
171195
response: string;
172196
};
173197
"request.IntentChangeRequest": {
174-
config?: components["schemas"]["request.IntentChangeConfigRequest"];
175198
input: components["schemas"]["request.IntentChangeInput"];
176199
};
177-
"request.IntentChangeConfigRequest": {
178-
/** @example gpt-4o */
179-
model?: string;
180-
};
181200
"request.IntentChangeInput": {
182201
/** @example ["Sure, I can help with hotel booking", "No problem, let me search for flights"] */
183202
completions: string;

0 commit comments

Comments
 (0)