evaluate.py

import json
import os
import time
from pathlib import Path
from urllib.parse import urlparse

from azure.ai.agents.models import RunStatus, MessageRole
from azure.ai.evaluation import (
    AIAgentConverter,
    evaluate,
    ToolCallAccuracyEvaluator,
    IntentResolutionEvaluator,
    TaskAdherenceEvaluator,
    CodeVulnerabilityEvaluator,
    ContentSafetyEvaluator,
    IndirectAttackEvaluator,
)
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv
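
# Requires the azure-ai-projects, azure-ai-evaluation, azure-identity, and
# python-dotenv packages (package names inferred from the imports above).
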
def run_evaluation():
    """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK."""
    current_dir = Path(__file__).parent
    eval_queries_path = current_dir / "eval-queries.json"
    eval_input_path = current_dir / "eval-input.jsonl"
    eval_output_path = current_dir / "eval-output.json"
    env_path = current_dir / "../src/.env"
    load_dotenv(dotenv_path=env_path)

    # Get AI project parameters from environment variables
    project_endpoint = os.environ.get("AZURE_EXISTING_AIPROJECT_ENDPOINT")
    deployment_name = os.getenv("AZURE_AI_AGENT_DEPLOYMENT_NAME")
    agent_name = os.environ.get("AZURE_AI_AGENT_NAME")
    agent_id = os.environ.get("AZURE_EXISTING_AGENT_ID")

    # Validate required environment variables before using them
    if not project_endpoint:
        raise ValueError("Please set the AZURE_EXISTING_AIPROJECT_ENDPOINT environment variable.")
    if not agent_id and not agent_name:
        raise ValueError("Please set either the AZURE_EXISTING_AGENT_ID or AZURE_AI_AGENT_NAME environment variable.")

    # Derive the model endpoint from the project endpoint
    parsed_endpoint = urlparse(project_endpoint)
    model_endpoint = f"{parsed_endpoint.scheme}://{parsed_endpoint.netloc}"

    # Initialize the AIProjectClient
    credential = DefaultAzureCredential()
    ai_project = AIProjectClient(
        credential=credential,
        endpoint=project_endpoint,
        api_version="2025-05-15-preview",  # Evaluations are not yet supported on the stable API version (2025-05-01)
    )
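
    # Note: DefaultAzureCredential (created above) resolves credentials from a
    # fixed chain (environment variables, managed identity, Azure CLI login, ...),
    # so a prior `az login` is typically enough for local runs.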

    # Look up the agent by name if an agent ID is not provided
    if not agent_id and agent_name:
        for agent in ai_project.agents.list_agents():
            if agent.name == agent_name:
                agent_id = agent.id
                break
    if not agent_id:
        raise ValueError("Agent ID not found. Please provide a valid agent ID or name.")
    agent = ai_project.agents.get_agent(agent_id)

    # Use the agent's model if no deployment name is provided
    if not deployment_name:
        deployment_name = agent.model

    # Set up the required evaluation config
    model_config = {
        "azure_deployment": deployment_name,
        "azure_endpoint": model_endpoint,
        "api_version": "",
    }
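    # model_config points the AI-assisted quality evaluators below (tool call
    # accuracy, intent resolution, task adherence) at the judge model
    # deployment they use for scoring.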
    thread_data_converter = AIAgentConverter(ai_project)

    # Read test queries from the input file
    with open(eval_queries_path, "r", encoding="utf-8") as f:
        test_data = json.load(f)
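
    # Illustrative shape of eval-queries.json, inferred from the fields this
    # script reads (the actual file may carry additional keys):
    # [
    #   {"query": "What is your return policy?",
    #    "ground-truth": "Returns are accepted within 30 days."},
    #   ...
    # ]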

    # Execute the test queries against the agent and prepare the evaluation input
    with open(eval_input_path, "w", encoding="utf-8") as f:
        for row in test_data:
            # Create a new thread for each query to isolate conversations
            thread = ai_project.agents.threads.create()

            # Create the user query
            ai_project.agents.messages.create(
                thread.id, role=MessageRole.USER, content=row.get("query")
            )

            # Run the agent on the thread and measure performance
            start_time = time.time()
            run = ai_project.agents.runs.create_and_process(
                thread_id=thread.id, agent_id=agent.id
            )
            end_time = time.time()
            if run.status != RunStatus.COMPLETED:
                raise ValueError(run.last_error or "Run failed to complete")

            operational_metrics = {
                "server-run-duration-in-seconds": (
                    run.completed_at - run.created_at
                ).total_seconds(),
                "client-run-duration-in-seconds": end_time - start_time,
                "completion-tokens": run.usage.completion_tokens,
                "prompt-tokens": run.usage.prompt_tokens,
                "ground-truth": row.get("ground-truth", ""),
            }

            # Add thread data + operational metrics to the evaluation input
            evaluation_data = thread_data_converter.prepare_evaluation_data(thread_ids=thread.id)
            eval_item = evaluation_data[0]
            eval_item["metrics"] = operational_metrics
            f.write(json.dumps(eval_item) + "\n")
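
    # Each JSONL line now holds the converted thread data (query, response,
    # tool calls) plus the custom "metrics" column; evaluate() routes columns
    # to evaluators whose keyword arguments match the column names, which is
    # how OperationalMetricsEvaluator below receives its "metrics" input.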

    # Now run a sample set of evaluators on the evaluation input.
    # See https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk
    # for the full list of available evaluators.
    results = evaluate(
        evaluation_name="evaluation-test",
        data=eval_input_path,
        evaluators={
            "operational_metrics": OperationalMetricsEvaluator(),
            "tool_call_accuracy": ToolCallAccuracyEvaluator(model_config=model_config),
            "intent_resolution": IntentResolutionEvaluator(model_config=model_config),
            "task_adherence": TaskAdherenceEvaluator(model_config=model_config),
            "code_vulnerability": CodeVulnerabilityEvaluator(credential=credential, azure_ai_project=project_endpoint),
            "content_safety": ContentSafetyEvaluator(credential=credential, azure_ai_project=project_endpoint),
            "indirect_attack": IndirectAttackEvaluator(credential=credential, azure_ai_project=project_endpoint),
        },
        output_path=eval_output_path,       # raw evaluation results
        azure_ai_project=project_endpoint,  # if you want results uploaded to AI Foundry
    )

    # Format and print the evaluation results
    print_eval_results(results, eval_input_path, eval_output_path)


class OperationalMetricsEvaluator:
    """Propagate operational metrics to the final evaluation results."""

    def __init__(self):
        pass

    def __call__(self, *, metrics: dict, **kwargs):
        return metrics
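

# Any callable whose keyword-only parameters match column names in the input
# data can act as a custom evaluator: evaluate() invokes it once per row and
# merges the returned dict into the results. A hypothetical sketch of another
# one along the same lines (assumes a "response" column is present in the
# data; not part of this repo):
#
#     class ResponseLengthEvaluator:
#         def __call__(self, *, response: str = "", **kwargs):
#             return {"response-length": len(response)}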


def print_eval_results(results, input_path, output_path):
    """Print the evaluation results in a formatted table."""
    metrics = results.get("metrics", {})

    # Get the maximum key length for formatting (default= guards against empty metrics)
    key_len = max((len(key) for key in metrics), default=10) + 5
    value_len = 20
    full_len = key_len + value_len + 5

    # Format the header
    print("\n" + "=" * full_len)
    print("Evaluation Results".center(full_len))
    print("=" * full_len)

    # Print all metrics; see the evaluation output file for full details
    print(f"{'Metric':<{key_len}} | {'Value'}")
    print("-" * key_len + "-+-" + "-" * value_len)
    for key, value in sorted(metrics.items()):
        if isinstance(value, float):
            formatted_value = f"{value:.2f}"
        else:
            formatted_value = str(value)
        print(f"{key:<{key_len}} | {formatted_value}")
    print("=" * full_len + "\n")

    # Print additional information
    print(f"Evaluation input: {input_path}")
    print(f"Evaluation output: {output_path}")
    if results.get("studio_url") is not None:
        print(f"AI Foundry URL: {results['studio_url']}")
    print("\n" + "=" * full_len + "\n")


if __name__ == "__main__":
    try:
        run_evaluation()
    except Exception as e:
        print(f"Error during evaluation: {e}")