Skip to content

Commit a6bd4be

Browse files
authored
refactor(traceloop-sdk): add made by Traceloop evals (#853)
1 parent f480743 commit a6bd4be

File tree

16 files changed

+2285
-32
lines changed

16 files changed

+2285
-32
lines changed

.prettierignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
pnpm-lock.yaml
2-
pnpm-workspace.yaml
2+
pnpm-workspace.yaml
3+
**/generated/**

package.json

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,15 @@
44
"license": "Apache-2.0",
55
"scripts": {
66
"build:all": "pnpm nx run-many -t build",
7-
"build:affected": "pnpm nx affected -t build"
7+
"build:affected": "pnpm nx affected -t build",
8+
"generate:evaluator-models": "./scripts/generate-models.sh"
89
},
910
"private": true,
1011
"devDependencies": {
12+
"@apidevtools/swagger-parser": "^10.1.0",
13+
"@types/node": "^24.0.15",
14+
"openapi-typescript": "^7.4.0",
15+
"ts-node": "^10.9.2",
1116
"@commitlint/cli": "^19.8.1",
1217
"@commitlint/config-conventional": "^19.8.1",
1318
"@eslint/eslintrc": "^3.3.1",

packages/sample-app/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"run:sample_generate": "npm run build && node dist/src/test_generate_only.js",
4141
"run:sample_experiment": "npm run build && node dist/src/sample_experiment.js",
4242
"run:github_experiment": "npm run build && node dist/src/sample_github_experiment.js",
43+
"run:security_experiment": "npm run build && node dist/src/sample_security_experiment.js",
4344
"run:mcp": "npm run build && node dist/src/sample_mcp.js",
4445
"run:mcp:real": "npm run build && node dist/src/sample_mcp_real.js",
4546
"run:mcp:working": "npm run build && node dist/src/sample_mcp_working.js",
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
/**
2+
* Security Evaluators Experiment
3+
*
4+
* This example demonstrates Traceloop's security evaluators:
5+
* - PII Detector: Identifies personal information exposure
6+
* - Secrets Detector: Monitors for credential and key leaks
7+
* - Prompt Injection: Detects prompt injection attempts
8+
*
9+
* These evaluators help ensure your AI applications don't leak sensitive data
10+
* or fall victim to prompt injection attacks.
11+
*/
12+
13+
import OpenAI from "openai";
14+
import {
15+
TraceloopClient,
16+
EvaluatorMadeByTraceloop,
17+
type TaskInput,
18+
type TaskOutput,
19+
type TaskResponse,
20+
} from "@traceloop/node-server-sdk";
21+
22+
// Environment-driven settings shared by the two API clients below.
const { OPENAI_API_KEY, TRACELOOP_API_KEY, TRACELOOP_API_ENDPOINT } =
  process.env;

// OpenAI client used to generate the responses that get evaluated.
const openai = new OpenAI({ apiKey: OPENAI_API_KEY });

// Traceloop client used to run the experiment.
const traceloop = new TraceloopClient({
  // The non-null assertion only satisfies the type checker; nothing here
  // verifies that TRACELOOP_API_KEY is actually set.
  apiKey: TRACELOOP_API_KEY!,
  appName: "security-evaluators-sample",
  baseUrl: TRACELOOP_API_ENDPOINT,
});
33+
34+
/**
 * Sends a single-turn user prompt to the OpenAI chat completions API and
 * returns the text of the first choice.
 *
 * @param prompt - Text sent as the sole user message.
 * @returns The first choice's message content, or an empty string when the
 *   response contains no choices or the content is empty/null.
 */
async function generateResponse(prompt: string): Promise<string> {
  const completion = await openai.chat.completions.create({
    model: "gpt-3.5-turbo",
    messages: [{ role: "user", content: prompt }],
    temperature: 0.7,
    max_tokens: 200,
  });

  // Guard the index access: an empty `choices` array would otherwise raise a
  // TypeError instead of falling back to the empty string.
  const firstChoice = completion.choices[0];
  return firstChoice?.message?.content || "";
}
47+
48+
/**
 * Experiment task: answers the row's `query` with the model and returns the
 * fields the security evaluators inspect.
 *
 * @param row - Dataset row; only its `query` field is read.
 * @returns `text` (the generated response, checked for PII, secrets, and
 *   prompt injection) and `prompt` (the original query, required by the
 *   prompt injection detector).
 */
async function securityTask(row: TaskInput): Promise<TaskOutput> {
  // Narrow at runtime rather than asserting the type: a missing or non-string
  // `query` falls back to an empty prompt instead of reaching the API as-is.
  const userQuery = typeof row.query === "string" ? row.query : "";

  const text = await generateResponse(userQuery);

  return {
    text,
    prompt: userQuery,
  };
}
64+
65+
/**
66+
* Run experiment with security evaluators.
67+
*
68+
* This experiment will evaluate responses for:
69+
* 1. PII (Personal Identifiable Information)
70+
* 2. Secrets (API keys, passwords, tokens)
71+
* 3. Prompt Injection attempts
72+
*/
73+
async function runSecurityExperiment(): Promise<void> {
74+
console.log("\n" + "=".repeat(80));
75+
console.log("SECURITY EVALUATORS EXPERIMENT");
76+
console.log("=".repeat(80) + "\n");
77+
78+
console.log(
79+
"This experiment will test three critical security evaluators:\n",
80+
);
81+
console.log(
82+
"1. PII Detector - Identifies personal information (names, emails, SSN, etc.)",
83+
);
84+
console.log(
85+
"2. Secrets Detector - Finds API keys, passwords, and credentials",
86+
);
87+
console.log(
88+
"3. Prompt Injection - Detects attempts to manipulate the AI system",
89+
);
90+
console.log("\n" + "-".repeat(80) + "\n");
91+
92+
// Configure security evaluators using the generated factory methods
93+
const evaluators = [
94+
EvaluatorMadeByTraceloop.piiDetector({ probability_threshold: 0.7 }),
95+
EvaluatorMadeByTraceloop.secretsDetector(),
96+
EvaluatorMadeByTraceloop.promptInjection({ threshold: 0.6 }),
97+
];
98+
99+
console.log("Configured evaluators:");
100+
evaluators.forEach((e) => console.log(` - ${e.name}`));
101+
console.log("\n" + "-".repeat(80) + "\n");
102+
103+
// Run the experiment
104+
const result = await traceloop.experiment.run(securityTask, {
105+
datasetSlug: "security", // Set a dataset slug that exists in the traceloop platform
106+
datasetVersion: "v1",
107+
evaluators,
108+
experimentSlug: "security-evaluators-exp",
109+
stopOnError: false,
110+
waitForResults: true,
111+
});
112+
113+
// Type guard: check if this is a local run result (ExperimentRunResult)
114+
if ("taskResults" in result) {
115+
const { taskResults, errors, experimentId, runId } = result;
116+
117+
console.log("\n" + "=".repeat(80));
118+
console.log("Security experiment completed!");
119+
console.log(`Experiment ID: ${experimentId}`);
120+
console.log(`Run ID: ${runId}`);
121+
console.log(
122+
`Task Results: ${taskResults.length}, Errors: ${errors.length}`,
123+
);
124+
console.log("=".repeat(80) + "\n");
125+
126+
// Print results summary
127+
if (taskResults.length > 0) {
128+
console.log("Results summary:");
129+
taskResults.forEach((taskResult: TaskResponse, idx: number) => {
130+
console.log(`\nTask ${idx + 1}:`);
131+
console.log(
132+
` Input: ${JSON.stringify(taskResult.input).substring(0, 100)}...`,
133+
);
134+
console.log(
135+
` Output: ${JSON.stringify(taskResult.output || {}).substring(0, 100)}...`,
136+
);
137+
if (taskResult.error) {
138+
console.log(` Error: ${taskResult.error}`);
139+
}
140+
});
141+
}
142+
143+
if (errors.length > 0) {
144+
console.log("\nErrors:");
145+
errors.forEach((error: string, idx: number) => {
146+
console.log(` ${idx + 1}. ${error}`);
147+
});
148+
}
149+
} else {
150+
// GitHub Actions result
151+
console.log("\n" + "=".repeat(80));
152+
console.log("Experiment submitted to GitHub Actions!");
153+
console.log(`Experiment ID: ${result.experimentId}`);
154+
console.log(`Experiment Slug: ${result.experimentSlug}`);
155+
console.log(`Run ID: ${result.runId}`);
156+
console.log("=".repeat(80) + "\n");
157+
}
158+
}
159+
160+
/**
 * Script entry point: prints a banner, runs the security experiment, and
 * terminates the process with exit code 1 if the experiment rejects.
 */
async function main(): Promise<void> {
  console.log("\nSecurity Evaluators Experiment\n");

  try {
    await runSecurityExperiment();
  } catch (failure) {
    console.error("Experiment failed:", failure);
    process.exit(1);
  }
}

// main() handles its own failures, so the returned promise is deliberately
// left unawaited.
void main();

packages/traceloop-sdk/src/lib/client/evaluator/evaluator.ts

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,26 +66,36 @@ export class Evaluator extends BaseDatasetEntity {
6666
throw new Error("experimentId, evaluator, and taskResult are required");
6767
}
6868

69-
// Handle both string and object evaluator types
69+
// Handle string, EvaluatorWithVersion, and EvaluatorWithConfig types
7070
const evaluatorName =
7171
typeof evaluator === "string" ? evaluator : evaluator.name;
7272
const evaluatorVersion =
7373
typeof evaluator === "string" ? undefined : evaluator.version;
74+
// Extract config if present (EvaluatorWithConfig type)
75+
const evaluatorConfig =
76+
typeof evaluator === "object" && "config" in evaluator
77+
? evaluator.config
78+
: undefined;
7479

7580
if (!evaluatorName) {
7681
throw new Error("evaluator name is required");
7782
}
7883

7984
const inputSchemaMapping = this.createInputSchemaMapping(taskResult);
8085

81-
const payload = {
86+
const payload: Record<string, unknown> = {
8287
experiment_id: experimentId,
8388
experiment_run_id: experimentRunId,
8489
evaluator_version: evaluatorVersion,
8590
task_id: taskId,
8691
input_schema_mapping: inputSchemaMapping,
8792
};
8893

94+
// Add evaluator config if present
95+
if (evaluatorConfig && Object.keys(evaluatorConfig).length > 0) {
96+
payload.evaluator_config = evaluatorConfig;
97+
}
98+
8999
const response = await this.client.post(
90100
`/v2/evaluators/slug/${evaluatorName}/execute`,
91101
payload,

packages/traceloop-sdk/src/lib/client/experiment/experiment.ts

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -329,15 +329,6 @@ export class Experiment {
329329
`Evaluator at index ${index} must have a valid non-empty name`,
330330
);
331331
}
332-
if (
333-
!evaluator.version ||
334-
typeof evaluator.version !== "string" ||
335-
!evaluator.version.trim()
336-
) {
337-
throw new Error(
338-
`Evaluator at index ${index} must have a valid non-empty version`,
339-
);
340-
}
341332
}
342333
});
343334
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
// Auto-generated - DO NOT EDIT
2+
// Regenerate with: pnpm generate:evaluator-models
3+
4+
export {
5+
EVALUATOR_SLUGS,
6+
EVALUATOR_SCHEMAS,
7+
getEvaluatorSchema,
8+
isValidEvaluatorSlug,
9+
} from './registry';
10+
11+
export type { EvaluatorSlug, EvaluatorSchema } from './registry';
12+
13+
export {
14+
EvaluatorMadeByTraceloop,
15+
createEvaluator,
16+
validateEvaluatorInput,
17+
getAvailableEvaluatorSlugs,
18+
getEvaluatorSchemaInfo,
19+
} from './mbt-evaluators';
20+
21+
// Re-export config types
22+
export type * from './mbt-evaluators';

0 commit comments

Comments
 (0)