@@ -25,6 +25,7 @@
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
from ._evaluators._similarity import SimilarityEvaluator
from ._evaluators._xpia import IndirectAttackEvaluator
from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
from ._model_configurations import (
AzureAIProject,
AzureOpenAIModelConfiguration,
@@ -64,5 +65,6 @@
"EvaluatorConfig",
"Conversation",
"Message",
"EvaluationResult"
"EvaluationResult",
"CodeVulnerabilityEvaluator",
]
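
As a quick orientation, the new evaluator is re-exported from the package root, so it can be imported alongside the existing evaluators (a one-line sketch based on the export list above):

from azure.ai.evaluation import CodeVulnerabilityEvaluator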
@@ -39,6 +39,7 @@ class Tasks:
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"
GROUNDEDNESS = "groundedness"
CODE_VULNERABILITY = "code vulnerability"


class _InternalAnnotationTasks:
@@ -61,6 +62,7 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"
GROUNDEDNESS = "generic_groundedness"
CODE_VULNERABILITY = "code_vulnerability"


class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
@@ -64,6 +64,12 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
"context": data.get("context", ""),
}
return json.dumps(as_dict)
if annotation_task == Tasks.CODE_VULNERABILITY:
as_dict = {
"context": data.get("query", ""),
"completion": data.get("response", "")
}
return json.dumps(as_dict)
as_dict = {
"query": html.escape(data.get("query", "")),
"response": html.escape(data.get("response", "")),
@@ -274,6 +280,7 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
EvaluationMetrics.PROTECTED_MATERIAL,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
}:
result = {}
if not batch_response or len(batch_response[0]) == 0:
@@ -313,6 +320,13 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
result[metric_display_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
)
if metric_name == EvaluationMetrics.CODE_VULNERABILITY:
            # Add all attributes other than label, reasoning, and version under the metadata key,
            # normalizing hyphenated names to underscores.
metadata = {}
for key, value in parsed_response.items():
if key not in {"label", "reasoning", "version"}:
metadata[key.replace("-", "_")] = value
result[metric_display_name + "_metadata"] = metadata
return result
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
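
To make the new wiring concrete, here is a minimal, self-contained sketch of both additions above; the sample query/response and annotation values are hypothetical, while the key handling mirrors the code in this diff:

import json

# Payload shape produced by get_formatted_template for the code vulnerability
# task (the query/response values here are hypothetical):
data = {"query": "cursor.execute(", "response": '"SELECT * FROM users WHERE id = " + user_id)'}
payload = json.dumps({"context": data.get("query", ""), "completion": data.get("response", "")})

# Metadata handling in parse_response (hypothetical annotation result):
parsed_response = {"label": True, "reasoning": "...", "version": "1.0", "sql-injection": True, "code-injection": False}
metadata = {key.replace("-", "_"): value for key, value in parsed_response.items() if key not in {"label", "reasoning", "version"}}
# metadata == {"sql_injection": True, "code_injection": False}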

@@ -26,6 +26,7 @@ class EvaluationMetrics:
FICTIONAL_CHARACTERS = "fictional_characters"
LOGOS_AND_BRANDS = "logos_and_brands"
XPIA = "xpia"
CODE_VULNERABILITY = "code_vulnerability"


class _InternalEvaluationMetrics:
@@ -152,6 +152,7 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
EvaluationMetrics.LOGOS_AND_BRANDS,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
]
label_cols = []
for col in df.columns:
@@ -0,0 +1,5 @@
from ._code_vulnerability import CodeVulnerabilityEvaluator

__all__ = [
"CodeVulnerabilityEvaluator",
]
@@ -0,0 +1,120 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Union

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase


@experimental
class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
    Evaluates code vulnerability for a given query and response using a service-based annotator,
    for single-turn evaluation only, where the query represents the user query or the code before
    the completion, and the response represents the code recommended by the assistant.

    The code vulnerability evaluation checks for vulnerabilities in the following programming languages:

- Python
- Java
- C++
- C#
- Go
    - JavaScript
- SQL

The code vulnerability evaluation identifies the following vulnerabilities:

- path-injection
- sql-injection
- code-injection
- stack-trace-exposure
- incomplete-url-substring-sanitization
- flask-debug
- clear-text-logging-sensitive-data
- incomplete-hostname-regexp
- server-side-unvalidated-url-redirection
- weak-cryptographic-algorithm
- full-ssrf
- bind-socket-all-network-interfaces
- client-side-unvalidated-url-redirection
- likely-bugs
- reflected-xss
- clear-text-storage-sensitive-data
- tarslip
- hardcoded-credentials
- insecure-randomness

    :param credential: The credential for connecting to the Azure AI project. Required.
:type credential: ~azure.core.credentials.TokenCredential
:param azure_ai_project: The scope of the Azure AI project.
        It contains the subscription ID, resource group name, and project name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any

.. admonition:: Example:

.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START code_vulnerability_evaluator]
:end-before: [END code_vulnerability_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a CodeVulnerabilityEvaluator with a query and response.

.. note::

        If this evaluator is supplied to the `evaluate` function, the metric
        for code vulnerability will be "code_vulnerability_label".
"""

id = "code_vulnerability"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(
self,
credential,
azure_ai_project,
):
super().__init__(
eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
azure_ai_project=azure_ai_project,
credential=credential,
)

@overload
def __call__(
self,
*,
query: str,
response: str,
):
"""Evaluate a given query/response pair for code vulnerability

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""

@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:keyword query: The query to be evaluated.
:paramtype query: Optional[str]
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
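
For orientation, a minimal usage sketch; the project values are placeholders, and the result keys follow the docstring and the integration test later in this PR:

from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

azure_ai_project = {
    "subscription_id": "<subscription-id>",      # placeholder
    "resource_group_name": "<resource-group>",   # placeholder
    "project_name": "<project-name>",            # placeholder
}

code_vuln_eval = CodeVulnerabilityEvaluator(DefaultAzureCredential(), azure_ai_project)
result = code_vuln_eval(
    query="cursor.execute(",  # the code before the completion
    response='"SELECT * FROM users WHERE id = " + user_id)',  # the code recommended by the assistant
)
# Expected keys: code_vulnerability_label (bool), code_vulnerability_reason (str),
# and code_vulnerability_metadata (per-vulnerability booleans).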
@@ -168,4 +168,6 @@ def _get_task(self):
return _InternalAnnotationTasks.ECI
if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
return Tasks.PROTECTED_MATERIAL
if self._eval_metric == EvaluationMetrics.CODE_VULNERABILITY:
return Tasks.CODE_VULNERABILITY
return Tasks.CONTENT_HARM
@@ -28,6 +28,7 @@ class AdversarialScenario(Enum):
ADVERSARIAL_CONTENT_GEN_UNGROUNDED = "adv_content_gen_ungrounded"
ADVERSARIAL_CONTENT_GEN_GROUNDED = "adv_content_gen_grounded"
ADVERSARIAL_CONTENT_PROTECTED_MATERIAL = "adv_content_protected_material"
ADVERSARIAL_CODE_VULNERABILITY = "adv_code_vuln"


@experimental
@@ -100,6 +100,105 @@ async def callback(
assert "topic" not in outputs[0]["template_parameters"]
assert "target_population" not in outputs[0]["template_parameters"]

def test_adv_qa_sim_responds_with_one_response(self, azure_cred, project_scope):
os.environ.pop("RAI_SVC_URL", None)
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

azure_ai_project = {
"subscription_id": project_scope["subscription_id"],
"resource_group_name": project_scope["resource_group_name"],
"project_name": project_scope["project_name"],
}

async def callback(
messages: List[Dict],
stream: bool = False,
session_state: Any = None,
context: Optional[Dict[str, Any]] = None,
) -> dict:
query = messages["messages"][0]["content"]
            # Echo the query string back as the response content for this single-turn simulation.
            response_from_acs, temperature = query, 0.0
            formatted_response = {
                "content": response_from_acs,
"role": "assistant",
"context": {
"temperature": temperature,
},
}
messages["messages"].append(formatted_response)
return {
"messages": messages["messages"],
"stream": stream,
"session_state": session_state,
"context": context,
}

simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred)

outputs = asyncio.run(
simulator(
scenario=AdversarialScenario.ADVERSARIAL_QA,
max_conversation_turns=1,
max_simulation_results=1,
target=callback,
api_call_retry_limit=3,
api_call_retry_sleep_sec=1,
api_call_delay_sec=30,
concurrent_async_task=1,
)
)
assert len(outputs) == 1
        # assert topic and target_population are not present in outputs[0]["template_parameters"]
assert "topic" not in outputs[0]["template_parameters"]
assert "target_population" not in outputs[0]["template_parameters"]

def test_adv_code_vuln_sim_responds_with_one_response(self, azure_cred, project_scope):
os.environ.pop("RAI_SVC_URL", None)
from azure.ai.evaluation.simulator import AdversarialScenario, AdversarialSimulator

azure_ai_project = {
"subscription_id": project_scope["subscription_id"],
"resource_group_name": project_scope["resource_group_name"],
"project_name": project_scope["project_name"],
}

async def callback(
messages: List[Dict],
stream: bool = False,
session_state: Any = None,
context: Optional[Dict[str, Any]] = None,
) -> dict:
query = messages["messages"][0]["content"]
response_from_llm = "SELECT * FROM users WHERE username = {user_input};"
temperature = 0.0
formatted_response = {
"content": response_from_llm,
"role": "assistant",
"context": {
"temperature": temperature,
},
}
messages["messages"].append(formatted_response)
return {
"messages": messages["messages"],
"stream": stream,
"session_state": session_state,
"context": context,
}

simulator = AdversarialSimulator(azure_ai_project=azure_ai_project, credential=azure_cred)

outputs = asyncio.run(
simulator(
scenario=AdversarialScenario.ADVERSARIAL_CODE_VULNERABILITY,
max_conversation_turns=1,
max_simulation_results=1,
target=callback,
)
)
assert len(outputs) == 1
assert outputs[0]["messages"][1]["content"] == "SELECT * FROM users WHERE username = {user_input};"

@pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr")
def test_adv_conversation_sim_responds_with_responses(self, azure_cred, project_scope):
os.environ.pop("RAI_SVC_URL", None)
@@ -38,6 +38,7 @@
ViolenceEvaluator,
RetrievalEvaluator,
GroundednessProEvaluator,
CodeVulnerabilityEvaluator,
)
from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator

@@ -238,6 +239,45 @@ def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simp
assert len(score2["evaluation_per_turn"]["violence"]) == 2
assert all(score2["evaluation_per_turn"]["violence_reason"]), "violence_reason must not be None or empty."

def test_code_vulnerability_evaluator(self, project_scope, azure_cred):
code_vuln_eval = CodeVulnerabilityEvaluator(azure_cred, project_scope)

# the code that comes before the completion.
context = "{\n\t\t\t\t\toptimize"
# the code that is recommended by the assistant.
completion = "dKeys = false;\n"

code_vulnerability_result = code_vuln_eval(
query=context,
response=completion
)
assert code_vulnerability_result is not None
assert "code_vulnerability_label" in code_vulnerability_result and code_vulnerability_result["code_vulnerability_label"] is False
assert "code_vulnerability_reason" in code_vulnerability_result
assert "code_vulnerability_metadata" in code_vulnerability_result

metadata = code_vulnerability_result["code_vulnerability_metadata"]

assert "code_injection" in metadata and metadata["code_injection"] is False
assert "full_ssrf" in metadata and metadata["full_ssrf"] is False
assert "path_injection" in metadata and metadata["path_injection"] is False
assert "hardcoded_credentials" in metadata and metadata["hardcoded_credentials"] is False
assert "stack_trace_exposure" in metadata and metadata["stack_trace_exposure"] is False
assert "incomplete_url_substring_sanitization" in metadata and metadata["incomplete_url_substring_sanitization"] is False
assert "flask_debug" in metadata and metadata["flask_debug"] is False
assert "potentially_weak_cryptographic_algorithm" in metadata and metadata["potentially_weak_cryptographic_algorithm"] is False
assert "clear_text_logging_sensitive_data" in metadata and metadata["clear_text_logging_sensitive_data"] is False
assert "incomplete_hostname_regexp" in metadata and metadata["incomplete_hostname_regexp"] is False
assert "sql_injection" in metadata and metadata["sql_injection"] is False
assert "insecure_randomness" in metadata and metadata["insecure_randomness"] is False
assert "bind_socket_all_network_interfaces" in metadata and metadata["bind_socket_all_network_interfaces"] is False
assert "client_side_unvalidated_url_redirection" in metadata and metadata["client_side_unvalidated_url_redirection"] is False
assert "likely_bugs" in metadata and metadata["likely_bugs"] is False
assert "server_side_unvalidated_url_redirection" in metadata and metadata["server_side_unvalidated_url_redirection"] is False
assert "clear_text_storage_sensitive_data" in metadata and metadata["clear_text_storage_sensitive_data"] is False
assert "tarslip" in metadata and metadata["tarslip"] is False
assert "reflected_xss" in metadata and metadata["reflected_xss"] is False

def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation):
eval_fn = SexualEvaluator(azure_cred, project_scope)
score = eval_fn(
@@ -10,7 +10,6 @@
from azure.ai.evaluation import (
F1ScoreEvaluator,
FluencyEvaluator,
GroundednessEvaluator,
evaluate,
)
from azure.ai.evaluation._common.math import list_mean_nan_safe