-
Notifications
You must be signed in to change notification settings - Fork 137
Expand file tree
/
Copy pathworkflow_judge.ts
More file actions
151 lines (132 loc) · 4.58 KB
/
workflow_judge.ts
File metadata and controls
151 lines (132 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/**
* LLM Judge for evaluating conversation quality
* Uses structured output (JSON schema) for robust parsing
*/
// eslint-disable-next-line import/extensions
import type { ResponseFormatJSONSchema } from 'openai/resources/shared';
import type { WorkflowTestCase } from '../shared/types.js';
import { JUDGE_PROMPT_TEMPLATE, MODELS } from './config.js';
import type { LlmClient } from './llm_client.js';
import type { ConversationHistory } from './types.js';
/**
 * Judge evaluation result.
 *
 * Produced by {@link evaluateConversation} after parsing the judge LLM's
 * structured JSON output.
 */
export type JudgeResult = {
  /** PASS or FAIL verdict */
  verdict: 'PASS' | 'FAIL';
  /** Explanation from judge (1-2 sentences, per the response schema) */
  reason: string;
  /** Raw response from judge (for debugging) */
  rawResponse: string;
}
/**
 * JSON schema for structured judge output.
 * Guarantees the LLM returns valid JSON matching this schema.
 *
 * `strict: true` plus `additionalProperties: false` means the provider
 * enforces exactly these two keys — parsing should only fail if the
 * provider misbehaves (see parseJudgeResponse, which validates anyway).
 */
const JUDGE_RESPONSE_SCHEMA: ResponseFormatJSONSchema = {
  type: 'json_schema',
  json_schema: {
    name: 'judge_evaluation',
    strict: true,
    schema: {
      type: 'object',
      properties: {
        verdict: {
          type: 'string',
          // Mirrors the JudgeResult.verdict union — keep the two in sync.
          enum: ['PASS', 'FAIL'],
          description: 'Whether the agent passed or failed the evaluation',
        },
        reason: {
          type: 'string',
          description: 'Brief explanation in 1-2 sentences explaining why the agent passed or failed',
        },
      },
      required: ['verdict', 'reason'],
      additionalProperties: false,
    },
  },
};
/**
 * Render a conversation as a plain-text transcript for judge evaluation.
 *
 * The judge sees tool invocations (name + JSON-serialized arguments) and
 * final agent responses — NOT the tool results.
 *
 * @param conversation - The recorded agent conversation to format.
 * @returns Transcript with blank lines separating the prompt and each turn.
 */
function formatConversationForJudge(conversation: ConversationHistory): string {
  // Seed with the user prompt, then a blank separator line.
  const transcript: string[] = [`USER: ${conversation.userPrompt}`, ''];

  for (const turn of conversation.turns) {
    // One line per tool call (iterating an empty list is a no-op).
    for (const call of turn.toolCalls) {
      transcript.push(`AGENT: [Called tool: ${call.name} with args: ${JSON.stringify(call.arguments)}]`);
    }

    // The final textual response, when the turn produced one.
    if (turn.finalResponse) {
      transcript.push(`AGENT: ${turn.finalResponse}`);
    }

    transcript.push('');
  }

  // trim() drops the trailing blank separator.
  return transcript.join('\n').trim();
}
/**
 * Parse the structured JSON response from the judge.
 *
 * The response schema should guarantee a well-formed payload, but we
 * validate at runtime anyway so schema drift or a misbehaving provider
 * surfaces as a clear error instead of a bad verdict downstream.
 *
 * @param response - Raw JSON string returned by the judge LLM.
 * @returns The validated verdict and reason.
 * @throws Error if the response is not valid JSON, is not an object, or
 *   fails verdict/reason validation. The raw response is included in the
 *   error message for debugging.
 */
function parseJudgeResponse(response: string): { verdict: 'PASS' | 'FAIL'; reason: string } {
  try {
    // Parse to `unknown` and narrow explicitly — an `as` assertion would
    // silently accept a malformed payload (e.g. `null` or a bare number)
    // and only fail incidentally on property access.
    const parsed: unknown = JSON.parse(response);
    if (parsed === null || typeof parsed !== 'object') {
      throw new Error(`Expected a JSON object, got: ${JSON.stringify(parsed)}`);
    }
    const { verdict, reason } = parsed as { verdict?: unknown; reason?: unknown };
    if (verdict !== 'PASS' && verdict !== 'FAIL') {
      throw new Error(`Invalid verdict: ${String(verdict)}`);
    }
    // Matches the original truthiness check: an empty reason is rejected.
    if (typeof reason !== 'string' || reason.length === 0) {
      throw new Error(`Invalid reason: ${String(reason)}`);
    }
    return { verdict, reason };
  } catch (error) {
    throw new Error(
      `Failed to parse judge JSON response: ${error instanceof Error ? error.message : String(error)}\n`
      + `Raw response: ${response}`,
    );
  }
}
/**
 * Evaluate a conversation using the judge LLM.
 *
 * Formats the conversation transcript, interpolates it with the test
 * case's reference into the judge prompt template, and calls the judge
 * model with a structured-output schema so the reply is guaranteed JSON.
 *
 * @param testCase - Test case providing the `reference` ground-truth text.
 * @param conversation - The recorded agent conversation to evaluate.
 * @param llmClient - Client used to call the judge model.
 * @param judgeModel - Judge model identifier (defaults to MODELS.judge).
 * @returns The parsed verdict and reason plus the raw judge response.
 * @throws Error if the judge response cannot be parsed (propagated from
 *   parseJudgeResponse, whose message already includes the raw response).
 */
export async function evaluateConversation(
  testCase: WorkflowTestCase,
  conversation: ConversationHistory,
  llmClient: LlmClient,
  judgeModel: string = MODELS.judge,
): Promise<JudgeResult> {
  // Format conversation for judge (tool calls + final responses only).
  const formattedConversation = formatConversationForJudge(conversation);

  // Create judge prompt using the test case's reference field.
  const judgePrompt = JUDGE_PROMPT_TEMPLATE
    .replace('{{reference}}', testCase.reference ?? '')
    .replace('{{conversation}}', formattedConversation);

  // Call judge LLM with structured output schema.
  const response = await llmClient.callLlm(
    [{ role: 'user', content: judgePrompt }],
    judgeModel,
    undefined, // No tools
    JUDGE_RESPONSE_SCHEMA, // Use structured output
  );

  const rawResponse = response.content ?? '';

  // No try/catch here: parseJudgeResponse already wraps failures with a
  // descriptive message AND the raw response, so re-wrapping duplicated
  // both in the thrown error.
  const { verdict, reason } = parseJudgeResponse(rawResponse);
  return { verdict, reason, rawResponse };
}