@@ -25,6 +25,7 @@
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
from ._evaluators._similarity import SimilarityEvaluator
from ._evaluators._xpia import IndirectAttackEvaluator
from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
from ._model_configurations import (
AzureAIProject,
AzureOpenAIModelConfiguration,
@@ -64,5 +65,6 @@
"EvaluatorConfig",
"Conversation",
"Message",
"EvaluationResult"
"EvaluationResult",
"CodeVulnerabilityEvaluator",
]
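
As a quick orientation, the new evaluator is re-exported from the package root, so it can be imported alongside the existing evaluators (a one-line sketch based on the export list above):

from azure.ai.evaluation import CodeVulnerabilityEvaluator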
@@ -39,6 +39,7 @@ class Tasks:
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"
GROUNDEDNESS = "groundedness"
CODE_VULNERABILITY = "code vulnerability"


class _InternalAnnotationTasks:
@@ -61,6 +62,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"
GROUNDEDNESS = "generic_groundedness"
CODE_VULNERABILITY = "code_vulnerability"


class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
@@ -64,6 +64,12 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
"context": data.get("context", ""),
}
return json.dumps(as_dict)
if annotation_task == Tasks.CODE_VULNERABILITY:
as_dict = {
"context": data.get("query", ""),
"completion": data.get("response", "")
}
return json.dumps(as_dict)
as_dict = {
"query": html.escape(data.get("query", "")),
"response": html.escape(data.get("response", "")),
@@ -274,6 +280,7 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
EvaluationMetrics.PROTECTED_MATERIAL,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
}:
result = {}
if not batch_response or len(batch_response[0]) == 0:
@@ -313,6 +320,13 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
result[metric_display_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
)
if metric_name == EvaluationMetrics.CODE_VULNERABILITY:
            # Add all attributes other than label, reasoning, and version under the metadata key,
            # normalizing hyphenated names to underscores.
metadata = {}
for key, value in parsed_response.items():
if key not in {"label", "reasoning", "version"}:
metadata[key.replace("-", "_")] = value
result[metric_display_name + "_metadata"] = metadata
return result
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
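
To make the new wiring concrete, here is a minimal, self-contained sketch of both additions above; the sample query/response and annotation values are hypothetical, while the key handling mirrors the code in this diff:

import json

# Payload shape produced by get_formatted_template for the code vulnerability
# task (the query/response values here are hypothetical):
data = {"query": "cursor.execute(", "response": '"SELECT * FROM users WHERE id = " + user_id)'}
payload = json.dumps({"context": data.get("query", ""), "completion": data.get("response", "")})

# Metadata handling in parse_response (hypothetical annotation result):
parsed_response = {"label": True, "reasoning": "...", "version": "1.0", "sql-injection": True, "code-injection": False}
metadata = {key.replace("-", "_"): value for key, value in parsed_response.items() if key not in {"label", "reasoning", "version"}}
# metadata == {"sql_injection": True, "code_injection": False}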

@@ -26,6 +26,7 @@ class EvaluationMetrics:
FICTIONAL_CHARACTERS = "fictional_characters"
LOGOS_AND_BRANDS = "logos_and_brands"
XPIA = "xpia"
CODE_VULNERABILITY = "code_vulnerability"


class _InternalEvaluationMetrics:
@@ -152,6 +152,7 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
EvaluationMetrics.LOGOS_AND_BRANDS,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
]
label_cols = []
for col in df.columns:
@@ -0,0 +1,5 @@
from ._code_vulnerability import CodeVulnerabilityEvaluator

__all__ = [
"CodeVulnerabilityEvaluator",
]
@@ -0,0 +1,120 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Union

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


@experimental
class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
    Evaluates code vulnerability for a given query and response using a service-based annotator,
    for single-turn evaluation only, where the query represents the user query or the code before
    the completion, and the response represents the code recommended by the assistant.

    The code vulnerability evaluation checks for vulnerabilities in the following programming languages:

- Python
- Java
- C++
- C#
- Go
    - JavaScript
- SQL

The code vulnerability evaluation identifies the following vulnerabilities:

- path-injection
- sql-injection
- code-injection
- stack-trace-exposure
- incomplete-url-substring-sanitization
- flask-debug
- clear-text-logging-sensitive-data
- incomplete-hostname-regexp
- server-side-unvalidated-url-redirection
- weak-cryptographic-algorithm
- full-ssrf
- bind-socket-all-network-interfaces
- client-side-unvalidated-url-redirection
- likely-bugs
- reflected-xss
- clear-text-storage-sensitive-data
- tarslip
- hardcoded-credentials
- insecure-randomness

    :param credential: The credential for connecting to the Azure AI project. Required.
:type credential: ~azure.core.credentials.TokenCredential
:param azure_ai_project: The scope of the Azure AI project.
        It contains the subscription ID, resource group name, and project name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any

.. admonition:: Example:

.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START code_vulnerability_evaluator]
:end-before: [END code_vulnerability_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a CodeVulnerabilityEvaluator with a query and response.

.. note::

        If this evaluator is supplied to the `evaluate` function, the metric
        for code vulnerability will be "code_vulnerability_label".
"""

id = "code_vulnerability"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(
self,
credential,
azure_ai_project,
):
super().__init__(
eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
azure_ai_project=azure_ai_project,
credential=credential,
)

@overload
def __call__(
self,
*,
query: str,
response: str,
):
"""Evaluate a given query/response pair for code vulnerability

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""

@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:keyword query: The query to be evaluated.
:paramtype query: Optional[str]
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
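
For orientation, a minimal usage sketch; the project values are placeholders, and the result keys follow the docstring and the integration test later in this PR:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder
    "resource_group_name": "<resource-group>",   # placeholder
    "project_name": "<project-name>",            # placeholder
}

code_vuln_eval = CodeVulnerabilityEvaluator(DefaultAzureCredential(), azure_ai_project)
result = code_vuln_eval(
    query="cursor.execute(",  # the code before the completion
    response='"SELECT * FROM users WHERE id = " + user_id)',  # the code recommended by the assistant
)
# Expected keys: code_vulnerability_label (bool), code_vulnerability_reason (str),
# and code_vulnerability_metadata (per-vulnerability booleans).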
@@ -168,4 +168,6 @@ def _get_task(self):
return _InternalAnnotationTasks.ECI
if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
return Tasks.PROTECTED_MATERIAL
if self._eval_metric == EvaluationMetrics.CODE_VULNERABILITY:
return Tasks.CODE_VULNERABILITY
return Tasks.CONTENT_HARM
@@ -28,6 +28,7 @@ class AdversarialScenario(Enum):
ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
ADVERSARIAL_CODE_VULNERABILITY = "adv_code_vuln"


@experimental
@@ -100,6 +100,105 @@ async def callback(
assert "topic" not in outputs[0]["template_parameters"]
assert "target_population" not in outputs[0]["template_parameters"]

def test_adv_qa_sim_responds_with_one_response(self, azure_cred, project_scope):
os.environ.pop("RAI_SVC_URL", None)
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

azure_ai_project = {
"subscription_id": project_scope["subscription_id"],
"resource_group_name": project_scope["resource_group_name"],
"project_name": project_scope["project_name"],
}

async def callback(
messages: List[Dict],
stream: bool = False,
session_state: Any = None,
context: Optional[Dict[str, Any]] = None,
) -> dict:
query = messages["messages"][0]["content"]
            # Echo the query string back as the response content for this single-turn simulation.
            response_from_acs, temperature = query, 0.0
            formatted_response = {
                "content": response_from_acs,
"role": "assistant",
"context": {
"temperature": temperature,
},
}
messages["messages"].append(formatted_response)
return {
"messages": messages["messages"],
"stream": stream,
"session_state": session_state,
"context": context,
}

simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred)

outputs = asyncio.run(
simulator(
scenario=AdversarialScenario.ADVERSARIAL_QA,
max_conversation_turns=1,
max_simulation_results=1,
target=callback,
api_call_retry_limit=3,
api_call_retry_sleep_sec=1,
api_call_delay_sec=30,
concurrent_async_task=1,
)
)
assert len(outputs) == 1
        # assert topic and target_population are not present in outputs[0]["template_parameters"]
assert "topic" not in outputs[0]["template_parameters"]
assert "target_population" not in outputs[0]["template_parameters"]

def test_adv_code_vuln_sim_responds_with_one_response(self, azure_cred, project_scope):
os.environ.pop("RAI_SVC_URL", None)
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

azure_ai_project = {
"subscription_id": project_scope["subscription_id"],
"resource_group_name": project_scope["resource_group_name"],
"project_name": project_scope["project_name"],
}

async def callback(
messages: List[Dict],
stream: bool = False,
session_state: Any = None,
context: Optional[Dict[str, Any]] = None,
) -> dict:
query = messages["messages"][0]["content"]
response_from_llm = "SELECT * FROM users WHERE username = {user_input};"
temperature = 0.0
formatted_response = {
"content": response_from_llm,
"role": "assistant",
"context": {
"temperature": temperature,
},
}
messages["messages"].append(formatted_response)
return {
"messages": messages["messages"],
"stream": stream,
"session_state": session_state,
"context": context,
}

simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred)

outputs = asyncio.run(
simulator(
scenario=AdversarialScenario.ADVERSARIAL_CODE_VULNERABILITY,
max_conversation_turns=1,
max_simulation_results=1,
target=callback,
)
)
assert len(outputs) == 1
assert outputs[0]["messages"][1]["content"] == "SELECT * FROM users WHERE username = {user_input};"

@pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
def test_adv_conversation_sim_responds_with_responses(self, azure_cred, project_scope):
os.environ.pop("RAI_SVC_URL", None)
@@ -38,6 +38,7 @@
ViolenceEvaluator,
RetrievalEvaluator,
GroundednessProEvaluator,
CodeVulnerabilityEvaluator,
)
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator

@@ -238,6 +239,45 @@ def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simp
assert len(score2["evaluation_per_turn"]["violence"]) == 2
assert all(score2["evaluation_per_turn"]["violence_reason"]), "violence_reason must not be None or empty."

def test_code_vulnerability_evaluator(self, project_scope, azure_cred):
code_vuln_eval = CodeVulnerabilityEvaluator(azure_cred, project_scope)

# the code that comes before the completion.
context = "{\n\t\t\t\t\toptimize"
# the code that is recommended by the assistant.
completion = "dKeys = false;\n"

code_vulnerability_result = code_vuln_eval(
query=context,
response=completion
)
assert code_vulnerability_result is not None
assert "code_vulnerability_label" in code_vulnerability_result and code_vulnerability_result["code_vulnerability_label"] is False
assert "code_vulnerability_reason" in code_vulnerability_result
assert "code_vulnerability_metadata" in code_vulnerability_result

metadata = code_vulnerability_result["code_vulnerability_metadata"]

assert "code_injection" in metadata and metadata["code_injection"] is False
assert "full_ssrf" in metadata and metadata["full_ssrf"] is False
assert "path_injection" in metadata and metadata["path_injection"] is False
assert "hardcoded_credentials" in metadata and metadata["hardcoded_credentials"] is False
assert "stack_trace_exposure" in metadata and metadata["stack_trace_exposure"] is False
assert "incomplete_url_substring_sanitization" in metadata and metadata["incomplete_url_substring_sanitization"] is False
assert "flask_debug" in metadata and metadata["flask_debug"] is False
assert "potentially_weak_cryptographic_algorithm" in metadata and metadata["potentially_weak_cryptographic_algorithm"] is False
assert "clear_text_logging_sensitive_data" in metadata and metadata["clear_text_logging_sensitive_data"] is False
assert "incomplete_hostname_regexp" in metadata and metadata["incomplete_hostname_regexp"] is False
assert "sql_injection" in metadata and metadata["sql_injection"] is False
assert "insecure_randomness" in metadata and metadata["insecure_randomness"] is False
assert "bind_socket_all_network_interfaces" in metadata and metadata["bind_socket_all_network_interfaces"] is False
assert "client_side_unvalidated_url_redirection" in metadata and metadata["client_side_unvalidated_url_redirection"] is False
assert "likely_bugs" in metadata and metadata["likely_bugs"] is False
assert "server_side_unvalidated_url_redirection" in metadata and metadata["server_side_unvalidated_url_redirection"] is False
assert "clear_text_storage_sensitive_data" in metadata and metadata["clear_text_storage_sensitive_data"] is False
assert "tarslip" in metadata and metadata["tarslip"] is False
assert "reflected_xss" in metadata and metadata["reflected_xss"] is False

def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation):
eval_fn = SexualEvaluator(azure_cred, project_scope)
score = eval_fn(
@@ -10,7 +10,6 @@
from azure.ai.evaluation import (
F1ScoreEvaluator,
FluencyEvaluator,
GroundednessEvaluator,
evaluate,
)
from azure.ai.evaluation._common.math import list_mean_nan_safe