Skip to content

Commit 1d1f642

Browse files
authored
fix(eval): Add agent tool trajectory eval (#854)
1 parent a6bd4be commit 1d1f642

File tree

5 files changed

+221
-24
lines changed

5 files changed

+221
-24
lines changed

packages/sample-app/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
"run:sample_experiment": "npm run build && node dist/src/sample_experiment.js",
4242
"run:github_experiment": "npm run build && node dist/src/sample_github_experiment.js",
4343
"run:security_experiment": "npm run build && node dist/src/sample_security_experiment.js",
44+
"run:agent_tool_trajectory": "npm run build && node dist/src/sample_agent_tool_trajectory.js",
4445
"run:mcp": "npm run build && node dist/src/sample_mcp.js",
4546
"run:mcp:real": "npm run build && node dist/src/sample_mcp_real.js",
4647
"run:mcp:working": "npm run build && node dist/src/sample_mcp_working.js",
packages/sample-app/src/sample_agent_tool_trajectory.ts

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
/**
2+
* Agent Tool Trajectory Experiment
3+
*
4+
* This example demonstrates Traceloop's agent tool trajectory evaluator:
5+
* - Agent Tool Trajectory: Validates the agent tool trajectory
6+
*
7+
* This evaluator helps ensure your AI agents perform optimally and follow the expected tool trajectory.
8+
*/
9+
10+
import * as traceloop from "@traceloop/node-server-sdk";
11+
import type {
12+
ExperimentTaskFunction,
13+
TaskInput,
14+
TaskOutput,
15+
} from "@traceloop/node-server-sdk";
16+
17+
import "dotenv/config";
18+
19+
const main = async () => {
20+
console.log("Agent Tool Trajectory Experiment\n");
21+
22+
traceloop.initialize({
23+
appName: "agent_tool_trajectory_experiment",
24+
apiKey: process.env.TRACELOOP_API_KEY,
25+
disableBatch: true,
26+
traceloopSyncEnabled: true,
27+
});
28+
29+
try {
30+
await traceloop.waitForInitialization();
31+
} catch (error) {
32+
console.error(
33+
"Failed to initialize Traceloop SDK:",
34+
error instanceof Error ? error.message : String(error),
35+
);
36+
process.exit(1);
37+
}
38+
39+
const client = traceloop.getClient();
40+
if (!client) {
41+
console.error("Failed to initialize Traceloop client");
42+
return;
43+
}
44+
45+
/**
46+
* Task function for agent tool trajectory evaluation
47+
*/
48+
const agentEvaluatorsTask: ExperimentTaskFunction = async (
49+
row: TaskInput,
50+
): Promise<TaskOutput> => {
51+
const executedToolCalls = (row.actual as string) || "";
52+
const defaultExpected =
53+
"[{'name': 'search', 'input': {'query': 'weather'}}, " +
54+
"{'name': 'book_flight', 'input': {'flight': 'NYC to Paris'}}, " +
55+
"{'name': 'get_confirmation', 'input': {'confirmation': 'flight booked'}}]";
56+
const expectedToolCalls = (row.expected as string) || defaultExpected;
57+
58+
return {
59+
executed_tool_calls: executedToolCalls,
60+
expected_tool_calls: expectedToolCalls,
61+
};
62+
};
63+
64+
console.log("\n" + "=".repeat(80));
65+
console.log("AGENT TOOL TRAJECTORY EXPERIMENT");
66+
console.log("=".repeat(80) + "\n");
67+
console.log(
68+
"This experiment will test the agent tool trajectory with the agent tool trajectory evaluator:\n",
69+
);
70+
console.log("1. Agent Tool Trajectory - Validates the agent tool trajectory");
71+
console.log("\n" + "-".repeat(80) + "\n");
72+
73+
// Configure agent evaluators
74+
// Using the evaluator slug directly - TypeScript should infer it's valid
75+
const evaluators = [
76+
{
77+
name: "agent-tool-trajectory",
78+
config: {
79+
input_params_sensitive: true,
80+
mismatch_sensitive: false,
81+
order_sensitive: false,
82+
threshold: 0.7,
83+
},
84+
},
85+
];
86+
87+
console.log("Running experiment with evaluators:");
88+
evaluators.forEach((evaluator) => {
89+
console.log(` - ${evaluator.name}`);
90+
});
91+
92+
console.log("\n" + "-".repeat(80) + "\n");
93+
94+
try {
95+
// Run the experiment
96+
// Note: You'll need to create a dataset with appropriate test cases for agents
97+
const result = await client.experiment.run(agentEvaluatorsTask, {
98+
datasetSlug: "agent-tool-trajectory", // Set a dataset slug that exists in the traceloop platform
99+
datasetVersion: "v1",
100+
evaluators,
101+
experimentSlug: "agent-tool-trajectory-exp",
102+
stopOnError: false,
103+
waitForResults: true,
104+
});
105+
106+
console.log("\n" + "=".repeat(80));
107+
console.log("Agent tool trajectory experiment completed!");
108+
console.log("=".repeat(80) + "\n");
109+
110+
if ("taskResults" in result) {
111+
console.log("Results summary:");
112+
console.log(` - Total rows processed: ${result.taskResults.length}`);
113+
console.log(` - Errors encountered: ${result.errors.length}`);
114+
console.log(` - Experiment ID: ${result.experimentId}`);
115+
116+
if (result.errors.length > 0) {
117+
console.log("\nErrors:");
118+
result.errors.forEach((error) => {
119+
console.log(` - ${error}`);
120+
});
121+
}
122+
}
123+
} catch (error) {
124+
console.error(
125+
"❌ Error in experiment operations:",
126+
error instanceof Error ? error.message : String(error),
127+
);
128+
if (error instanceof Error && error.stack) {
129+
console.error("Stack trace:", error.stack);
130+
}
131+
}
132+
};
133+
134+
// Error handling for the main function
135+
main().catch((error) => {
136+
console.error("💥 Application failed:", error.message);
137+
process.exit(1);
138+
});

packages/traceloop-sdk/src/lib/generated/evaluators/mbt-evaluators.ts

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ import { EVALUATOR_SLUGS, EVALUATOR_SCHEMAS, isValidEvaluatorSlug, type Evaluato
88
// Config type aliases from generated OpenAPI types
99
export type AgentFlowQualityConfig = components['schemas']['request.AgentFlowQualityRequest']['config'];
1010
export type AgentGoalCompletenessConfig = components['schemas']['request.AgentGoalCompletenessRequest']['config'];
11+
export type AgentToolTrajectoryConfig = components['schemas']['request.AgentToolTrajectoryRequest']['config'];
1112
export type ContextRelevanceConfig = components['schemas']['request.ContextRelevanceRequest']['config'];
12-
export type ConversationQualityConfig = components['schemas']['request.ConversationQualityRequest']['config'];
13-
export type IntentChangeConfig = components['schemas']['request.IntentChangeRequest']['config'];
1413
export type JsonValidatorConfig = components['schemas']['request.JSONValidatorRequest']['config'];
1514
export type PiiDetectorConfig = components['schemas']['request.PIIDetectorRequest']['config'];
1615
export type PlaceholderRegexConfig = components['schemas']['request.PlaceholderRegexRequest']['config'];
@@ -167,6 +166,22 @@ export class EvaluatorMadeByTraceloop {
167166
return createEvaluator('agent-tool-error-detector');
168167
}
169168

169+
/**
170+
* Compare actual tool calls against expected reference tool calls
171+
172+
**Request Body:**
173+
- `input.executed_tool_calls` (string, required): JSON array of actual tool calls made by the agent
174+
- `input.expected_tool_calls` (string, required): JSON array of expected/reference tool calls
175+
- `config.threshold` (float, optional): Score threshold for pass/fail determination (default: 0.5)
176+
- `config.mismatch_sensitive` (bool, optional): Whether tool calls must match exactly (default: false)
177+
- `config.order_sensitive` (bool, optional): Whether order of tool calls matters (default: false)
178+
- `config.input_params_sensitive` (bool, optional): Whether to compare input parameters (default: true)
179+
* Required task output fields: executed_tool_calls, expected_tool_calls
180+
*/
181+
static agentToolTrajectory(config?: AgentToolTrajectoryConfig): EvaluatorWithConfig {
182+
return createEvaluator('agent-tool-trajectory', { config: config as Record<string, unknown> });
183+
}
184+
170185
/**
171186
* Evaluate whether the answer is complete and contains all the necessary information
172187
@@ -247,11 +262,10 @@ export class EvaluatorMadeByTraceloop {
247262
**Request Body:**
248263
- `input.prompts` (string, required): JSON array of prompts in the conversation
249264
- `input.completions` (string, required): JSON array of completions in the conversation
250-
- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)
251265
* Required task output fields: completions, prompts
252266
*/
253-
static conversationQuality(config?: ConversationQualityConfig): EvaluatorWithConfig {
254-
return createEvaluator('conversation-quality', { config: config as Record<string, unknown> });
267+
static conversationQuality(): EvaluatorWithConfig {
268+
return createEvaluator('conversation-quality');
255269
}
256270

257271
/**
@@ -267,6 +281,18 @@ export class EvaluatorMadeByTraceloop {
267281
return createEvaluator('faithfulness');
268282
}
269283

284+
/**
285+
* Compare two HTML documents for structural and content similarity
286+
287+
**Request Body:**
288+
- `input.html1` (string, required): The first HTML document to compare
289+
- `input.html2` (string, required): The second HTML document to compare
290+
* Required task output fields: html1, html2
291+
*/
292+
static htmlComparison(): EvaluatorWithConfig {
293+
return createEvaluator('html-comparison');
294+
}
295+
270296
/**
271297
* Evaluate how well responses follow given instructions
272298
@@ -285,11 +311,10 @@ export class EvaluatorMadeByTraceloop {
285311
**Request Body:**
286312
- `input.prompts` (string, required): JSON array of prompts in the conversation
287313
- `input.completions` (string, required): JSON array of completions in the conversation
288-
- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)
289314
* Required task output fields: completions, prompts
290315
*/
291-
static intentChange(config?: IntentChangeConfig): EvaluatorWithConfig {
292-
return createEvaluator('intent-change', { config: config as Record<string, unknown> });
316+
static intentChange(): EvaluatorWithConfig {
317+
return createEvaluator('intent-change');
293318
}
294319

295320
/**

packages/traceloop-sdk/src/lib/generated/evaluators/registry.ts

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@ export interface EvaluatorSchema {
88
description?: string;
99
}
1010

11-
export type EvaluatorSlug = 'agent-efficiency' | 'agent-flow-quality' | 'agent-goal-accuracy' | 'agent-goal-completeness' | 'agent-tool-error-detector' | 'answer-completeness' | 'answer-correctness' | 'answer-relevancy' | 'char-count' | 'char-count-ratio' | 'context-relevance' | 'conversation-quality' | 'faithfulness' | 'instruction-adherence' | 'intent-change' | 'json-validator' | 'perplexity' | 'pii-detector' | 'placeholder-regex' | 'profanity-detector' | 'prompt-injection' | 'prompt-perplexity' | 'regex-validator' | 'secrets-detector' | 'semantic-similarity' | 'sexism-detector' | 'sql-validator' | 'tone-detection' | 'topic-adherence' | 'toxicity-detector' | 'uncertainty-detector' | 'word-count' | 'word-count-ratio';
11+
export type EvaluatorSlug = 'agent-efficiency' | 'agent-flow-quality' | 'agent-goal-accuracy' | 'agent-goal-completeness' | 'agent-tool-error-detector' | 'agent-tool-trajectory' | 'answer-completeness' | 'answer-correctness' | 'answer-relevancy' | 'char-count' | 'char-count-ratio' | 'context-relevance' | 'conversation-quality' | 'faithfulness' | 'html-comparison' | 'instruction-adherence' | 'intent-change' | 'json-validator' | 'perplexity' | 'pii-detector' | 'placeholder-regex' | 'profanity-detector' | 'prompt-injection' | 'prompt-perplexity' | 'regex-validator' | 'secrets-detector' | 'semantic-similarity' | 'sexism-detector' | 'sql-validator' | 'tone-detection' | 'topic-adherence' | 'toxicity-detector' | 'uncertainty-detector' | 'word-count' | 'word-count-ratio';
1212

1313
export const EVALUATOR_SLUGS: EvaluatorSlug[] = [
1414
'agent-efficiency',
1515
'agent-flow-quality',
1616
'agent-goal-accuracy',
1717
'agent-goal-completeness',
1818
'agent-tool-error-detector',
19+
'agent-tool-trajectory',
1920
'answer-completeness',
2021
'answer-correctness',
2122
'answer-relevancy',
@@ -24,6 +25,7 @@ export const EVALUATOR_SLUGS: EvaluatorSlug[] = [
2425
'context-relevance',
2526
'conversation-quality',
2627
'faithfulness',
28+
'html-comparison',
2729
'instruction-adherence',
2830
'intent-change',
2931
'json-validator',
@@ -77,6 +79,12 @@ export const EVALUATOR_SCHEMAS: Record<EvaluatorSlug, EvaluatorSchema> = {
7779
optionalConfigFields: [],
7880
description: "Detect errors or failures during tool execution\n\n**Request Body:**\n- `input.tool_input` (string, required): JSON string of the tool input\n- `input.tool_output` (string, required): JSON string of the tool output",
7981
},
82+
'agent-tool-trajectory': {
83+
slug: 'agent-tool-trajectory',
84+
requiredInputFields: ['executed_tool_calls', 'expected_tool_calls'],
85+
optionalConfigFields: ['input_params_sensitive', 'mismatch_sensitive', 'order_sensitive', 'threshold'],
86+
description: "Compare actual tool calls against expected reference tool calls\n\n**Request Body:**\n- `input.executed_tool_calls` (string, required): JSON array of actual tool calls made by the agent\n- `input.expected_tool_calls` (string, required): JSON array of expected/reference tool calls\n- `config.threshold` (float, optional): Score threshold for pass/fail determination (default: 0.5)\n- `config.mismatch_sensitive` (bool, optional): Whether tool calls must match exactly (default: false)\n- `config.order_sensitive` (bool, optional): Whether order of tool calls matters (default: false)\n- `config.input_params_sensitive` (bool, optional): Whether to compare input parameters (default: true)",
87+
},
8088
'answer-completeness': {
8189
slug: 'answer-completeness',
8290
requiredInputFields: ['completion', 'context', 'question'],
@@ -116,15 +124,21 @@ export const EVALUATOR_SCHEMAS: Record<EvaluatorSlug, EvaluatorSchema> = {
116124
'conversation-quality': {
117125
slug: 'conversation-quality',
118126
requiredInputFields: ['completions', 'prompts'],
119-
optionalConfigFields: ['model'],
120-
description: "Evaluate conversation quality based on tone, clarity, flow, responsiveness, and transparency\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation\n- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)",
127+
optionalConfigFields: [],
128+
description: "Evaluate conversation quality based on tone, clarity, flow, responsiveness, and transparency\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation",
121129
},
122130
'faithfulness': {
123131
slug: 'faithfulness',
124132
requiredInputFields: ['completion', 'context', 'question'],
125133
optionalConfigFields: [],
126134
description: "Check if a completion is faithful to the provided context\n\n**Request Body:**\n- `input.completion` (string, required): The LLM completion to check for faithfulness\n- `input.context` (string, required): The context that the completion should be faithful to\n- `input.question` (string, required): The original question asked",
127135
},
136+
'html-comparison': {
137+
slug: 'html-comparison',
138+
requiredInputFields: ['html1', 'html2'],
139+
optionalConfigFields: [],
140+
description: "Compare two HTML documents for structural and content similarity\n\n**Request Body:**\n- `input.html1` (string, required): The first HTML document to compare\n- `input.html2` (string, required): The second HTML document to compare",
141+
},
128142
'instruction-adherence': {
129143
slug: 'instruction-adherence',
130144
requiredInputFields: ['instructions', 'response'],
@@ -134,8 +148,8 @@ export const EVALUATOR_SCHEMAS: Record<EvaluatorSlug, EvaluatorSchema> = {
134148
'intent-change': {
135149
slug: 'intent-change',
136150
requiredInputFields: ['completions', 'prompts'],
137-
optionalConfigFields: ['model'],
138-
description: "Detect changes in user intent between prompts and completions\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation\n- `config.model` (string, optional): Model to use for evaluation (default: gpt-4o)",
151+
optionalConfigFields: [],
152+
description: "Detect changes in user intent between prompts and completions\n\n**Request Body:**\n- `input.prompts` (string, required): JSON array of prompts in the conversation\n- `input.completions` (string, required): JSON array of completions in the conversation",
139153
},
140154
'json-validator': {
141155
slug: 'json-validator',

packages/traceloop-sdk/src/lib/generated/evaluators/types.ts

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ export interface components {
1717
trajectory_prompts: string;
1818
};
1919
"request.AgentFlowQualityRequest": {
20-
config?: components["schemas"]["request.AgentFlowQualityConfigRequest"];
20+
config: components["schemas"]["request.AgentFlowQualityConfigRequest"];
2121
input: components["schemas"]["request.AgentFlowQualityInput"];
2222
};
2323
"request.AgentFlowQualityConfigRequest": {
@@ -71,6 +71,26 @@ export interface components {
7171
/** @example {"status": "success", "results": [{"flight": "AF123", "price": 450}]} */
7272
tool_output: string;
7373
};
74+
"request.AgentToolTrajectoryRequest": {
75+
config?: components["schemas"]["request.AgentToolTrajectoryConfigRequest"];
76+
input: components["schemas"]["request.AgentToolTrajectoryInput"];
77+
};
78+
"request.AgentToolTrajectoryConfigRequest": {
79+
/** @example true */
80+
input_params_sensitive?: boolean;
81+
/** @example false */
82+
mismatch_sensitive?: boolean;
83+
/** @example false */
84+
order_sensitive?: boolean;
85+
/** @example 0.5 */
86+
threshold?: number;
87+
};
88+
"request.AgentToolTrajectoryInput": {
89+
/** @example [{"name": "search", "input": {"query": "weather"}}] */
90+
executed_tool_calls: string;
91+
/** @example [{"name": "search", "input": {"query": "weather"}}] */
92+
expected_tool_calls: string;
93+
};
7494
"request.AnswerCompletenessRequest": {
7595
input: components["schemas"]["request.AnswerCompletenessInput"];
7696
};
@@ -133,13 +153,8 @@ export interface components {
133153
query: string;
134154
};
135155
"request.ConversationQualityRequest": {
136-
config?: components["schemas"]["request.ConversationQualityConfigRequest"];
137156
input: components["schemas"]["request.ConversationQualityInput"];
138157
};
139-
"request.ConversationQualityConfigRequest": {
140-
/** @example gpt-4o */
141-
model?: string;
142-
};
143158
"request.ConversationQualityInput": {
144159
/** @example ["Hi! I'd be happy to assist you today.", "We offer consulting, development, and support services."] */
145160
completions: string;
@@ -157,6 +172,15 @@ export interface components {
157172
/** @example When was the Eiffel Tower built? */
158173
question: string;
159174
};
175+
"request.HtmlComparisonRequest": {
176+
input: components["schemas"]["request.HtmlComparisonInput"];
177+
};
178+
"request.HtmlComparisonInput": {
179+
/** @example <html><body><h1>Hello, world!</h1></body></html> */
180+
html1: string;
181+
/** @example <html><body><h1>Hello, world!</h1></body></html> */
182+
html2: string;
183+
};
160184
"request.InstructionAdherenceRequest": {
161185
input: components["schemas"]["request.InstructionAdherenceInput"];
162186
};
@@ -171,13 +195,8 @@ export interface components {
171195
response: string;
172196
};
173197
"request.IntentChangeRequest": {
174-
config?: components["schemas"]["request.IntentChangeConfigRequest"];
175198
input: components["schemas"]["request.IntentChangeInput"];
176199
};
177-
"request.IntentChangeConfigRequest": {
178-
/** @example gpt-4o */
179-
model?: string;
180-
};
181200
"request.IntentChangeInput": {
182201
/** @example ["Sure, I can help with hotel booking", "No problem, let me search for flights"] */
183202
completions: string;

0 commit comments

Comments
 (0)