evaluate.py

import json
import os
import time
from pathlib import Path
from urllib.parse import urlparse

from azure.ai.agents.models import RunStatus, MessageRole
from azure.ai.evaluation import (
    AIAgentConverter,
    evaluate,
    ToolCallAccuracyEvaluator,
    IntentResolutionEvaluator,
    TaskAdherenceEvaluator,
    CodeVulnerabilityEvaluator,
    ContentSafetyEvaluator,
    IndirectAttackEvaluator,
)
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv
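
# Requires the azure-ai-projects, azure-ai-evaluation, azure-identity, and
# python-dotenv packages (package names inferred from the imports above).
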
def run_evaluation():
    """Demonstrate how to evaluate an AI agent using the Azure AI Project SDK."""
    current_dir = Path(__file__).parent
    eval_queries_path = current_dir / "eval-queries.json"
    eval_input_path = current_dir / "eval-input.jsonl"
    eval_output_path = current_dir / "eval-output.json"
    env_path = current_dir / "../src/.env"
    load_dotenv(dotenv_path=env_path)

    # Get AI project parameters from environment variables
    project_endpoint = os.environ.get("AZURE_EXISTING_AIPROJECT_ENDPOINT")
    deployment_name = os.getenv("AZURE_AI_AGENT_DEPLOYMENT_NAME")
    agent_name = os.environ.get("AZURE_AI_AGENT_NAME")
    agent_id = os.environ.get("AZURE_EXISTING_AGENT_ID")

    # Validate required environment variables before using them
    if not project_endpoint:
        raise ValueError("Please set the AZURE_EXISTING_AIPROJECT_ENDPOINT environment variable.")
    if not agent_id and not agent_name:
        raise ValueError("Please set either the AZURE_EXISTING_AGENT_ID or AZURE_AI_AGENT_NAME environment variable.")

    # Derive the model endpoint from the project endpoint
    parsed_endpoint = urlparse(project_endpoint)
    model_endpoint = f"{parsed_endpoint.scheme}://{parsed_endpoint.netloc}"

    # Initialize the AIProjectClient
    credential = DefaultAzureCredential()
    ai_project = AIProjectClient(
        credential=credential,
        endpoint=project_endpoint,
        api_version="2025-05-15-preview",  # Evaluations are not yet supported on the stable API version (2025-05-01)
    )
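
    # Note: DefaultAzureCredential (created above) resolves credentials from a
    # fixed chain (environment variables, managed identity, Azure CLI login, ...),
    # so a prior `az login` is typically enough for local runs.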

    # Look up the agent by name if an agent ID is not provided
    if not agent_id and agent_name:
        for agent in ai_project.agents.list_agents():
            if agent.name == agent_name:
                agent_id = agent.id
                break
    if not agent_id:
        raise ValueError("Agent ID not found. Please provide a valid agent ID or name.")
    agent = ai_project.agents.get_agent(agent_id)

    # Use the agent's model if no deployment name is provided
    if not deployment_name:
        deployment_name = agent.model

    # Set up the required evaluation config
    model_config = {
        "azure_deployment": deployment_name,
        "azure_endpoint": model_endpoint,
        "api_version": "",
    }
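    # model_config points the AI-assisted quality evaluators below (tool call
    # accuracy, intent resolution, task adherence) at the judge model
    # deployment they use for scoring.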
    thread_data_converter = AIAgentConverter(ai_project)

    # Read test queries from the input file
    with open(eval_queries_path, "r", encoding="utf-8") as f:
        test_data = json.load(f)
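
    # Illustrative shape of eval-queries.json, inferred from the fields this
    # script reads (the actual file may carry additional keys):
    # [
    #   {"query": "What is your return policy?",
    #    "ground-truth": "Returns are accepted within 30 days."},
    #   ...
    # ]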

    # Execute the test queries against the agent and prepare the evaluation input
    with open(eval_input_path, "w", encoding="utf-8") as f:
        for row in test_data:
            # Create a new thread for each query to isolate conversations
            thread = ai_project.agents.threads.create()

            # Create the user query
            ai_project.agents.messages.create(
                thread.id, role=MessageRole.USER, content=row.get("query")
            )

            # Run the agent on the thread and measure performance
            start_time = time.time()
            run = ai_project.agents.runs.create_and_process(
                thread_id=thread.id, agent_id=agent.id
            )
            end_time = time.time()
            if run.status != RunStatus.COMPLETED:
                raise ValueError(run.last_error or "Run failed to complete")

            operational_metrics = {
                "server-run-duration-in-seconds": (
                    run.completed_at - run.created_at
                ).total_seconds(),
                "client-run-duration-in-seconds": end_time - start_time,
                "completion-tokens": run.usage.completion_tokens,
                "prompt-tokens": run.usage.prompt_tokens,
                "ground-truth": row.get("ground-truth", ""),
            }

            # Add thread data + operational metrics to the evaluation input
            evaluation_data = thread_data_converter.prepare_evaluation_data(thread_ids=thread.id)
            eval_item = evaluation_data[0]
            eval_item["metrics"] = operational_metrics
            f.write(json.dumps(eval_item) + "\n")
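
    # Each JSONL line now holds the converted thread data (query, response,
    # tool calls) plus the custom "metrics" column; evaluate() routes columns
    # to evaluators whose keyword arguments match the column names, which is
    # how OperationalMetricsEvaluator below receives its "metrics" input.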

    # Now run a sample set of evaluators on the evaluation input.
    # See https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk
    # for the full list of available evaluators.
    results = evaluate(
        evaluation_name="evaluation-test",
        data=eval_input_path,
        evaluators={
            "operational_metrics": OperationalMetricsEvaluator(),
            "tool_call_accuracy": ToolCallAccuracyEvaluator(model_config=model_config),
            "intent_resolution": IntentResolutionEvaluator(model_config=model_config),
            "task_adherence": TaskAdherenceEvaluator(model_config=model_config),
            "code_vulnerability": CodeVulnerabilityEvaluator(credential=credential, azure_ai_project=project_endpoint),
            "content_safety": ContentSafetyEvaluator(credential=credential, azure_ai_project=project_endpoint),
            "indirect_attack": IndirectAttackEvaluator(credential=credential, azure_ai_project=project_endpoint),
        },
        output_path=eval_output_path,       # raw evaluation results
        azure_ai_project=project_endpoint,  # if you want results uploaded to AI Foundry
    )

    # Format and print the evaluation results
    print_eval_results(results, eval_input_path, eval_output_path)


class OperationalMetricsEvaluator:
    """Propagate operational metrics to the final evaluation results."""

    def __init__(self):
        pass

    def __call__(self, *, metrics: dict, **kwargs):
        return metrics
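

# Any callable whose keyword-only parameters match column names in the input
# data can act as a custom evaluator: evaluate() invokes it once per row and
# merges the returned dict into the results. A hypothetical sketch of another
# one along the same lines (assumes a "response" column is present in the
# data; not part of this repo):
#
#     class ResponseLengthEvaluator:
#         def __call__(self, *, response: str = "", **kwargs):
#             return {"response-length": len(response)}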


def print_eval_results(results, input_path, output_path):
    """Print the evaluation results in a formatted table."""
    metrics = results.get("metrics", {})

    # Get the maximum key length for formatting (default= guards against empty metrics)
    key_len = max((len(key) for key in metrics), default=10) + 5
    value_len = 20
    full_len = key_len + value_len + 5

    # Format the header
    print("\n" + "=" * full_len)
    print("Evaluation Results".center(full_len))
    print("=" * full_len)

    # Print all metrics; see the evaluation output file for full details
    print(f"{'Metric':<{key_len}} | {'Value'}")
    print("-" * key_len + "-+-" + "-" * value_len)
    for key, value in sorted(metrics.items()):
        if isinstance(value, float):
            formatted_value = f"{value:.2f}"
        else:
            formatted_value = str(value)
        print(f"{key:<{key_len}} | {formatted_value}")
    print("=" * full_len + "\n")

    # Print additional information
    print(f"Evaluation input: {input_path}")
    print(f"Evaluation output: {output_path}")
    if results.get("studio_url") is not None:
        print(f"AI Foundry URL: {results['studio_url']}")
    print("\n" + "=" * full_len + "\n")


if __name__ == "__main__":
    try:
        run_evaluation()
    except Exception as e:
        print(f"Error during evaluation: {e}")