2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_2840d9d130"
"Tag": "python/evaluation/azure-ai-evaluation_83a7766f56"
}
@@ -25,6 +25,8 @@
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
from ._evaluators._similarity import SimilarityEvaluator
from ._evaluators._xpia import IndirectAttackEvaluator
from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
from ._evaluators._isa import ISAEvaluator
from ._model_configurations import (
AzureAIProject,
AzureOpenAIModelConfiguration,
@@ -65,4 +67,6 @@
"Conversation",
"Message",
"EvaluationResult",
"CodeVulnerabilityEvaluator",
"ISAEvaluator",
]
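Both new evaluators are exported from the package root and added to `__all__`. A minimal construction sketch, assuming a `DefaultAzureCredential` and a placeholder project scope (the values below are illustrative, not taken from this PR):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator, ISAEvaluator

# Placeholder Azure AI project scope; substitute real values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

credential = DefaultAzureCredential()
code_vuln_eval = CodeVulnerabilityEvaluator(credential, azure_ai_project)
isa_eval = ISAEvaluator(credential, azure_ai_project)
```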
@@ -39,6 +39,8 @@ class Tasks:
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"
GROUNDEDNESS = "groundedness"
CODE_VULNERABILITY = "code vulnerability"
ISA = "inference sensitive attributes"


class _InternalAnnotationTasks:
@@ -61,6 +63,8 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"
GROUNDEDNESS = "generic_groundedness"
CODE_VULNERABILITY = "code_vulnerability"
ISA = "inference_sensitive_attributes"


class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
@@ -64,6 +64,19 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
"context": data.get("context", ""),
}
return json.dumps(as_dict)
if annotation_task == Tasks.CODE_VULNERABILITY:
as_dict = {
"context": data.get("query", ""),
"completion": data.get("response", "")
}
return json.dumps(as_dict)
if annotation_task == Tasks.ISA:
as_dict = {
"query": data.get("query", ""),
"response": data.get("response", ""),
"context": data.get("context", "")
}
return json.dumps(as_dict)
as_dict = {
"query": html.escape(data.get("query", "")),
"response": html.escape(data.get("response", "")),
@@ -160,6 +173,8 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
task = annotation_task
if metric == EvaluationMetrics.PROTECTED_MATERIAL:
include_metric = False
elif metric == EvaluationMetrics.ISA:
include_metric = False
elif metric == _InternalEvaluationMetrics.ECI:
include_metric = False
elif metric == EvaluationMetrics.XPIA:
@@ -274,6 +289,8 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
EvaluationMetrics.PROTECTED_MATERIAL,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.ISA,
}:
result = {}
if not batch_response or len(batch_response[0]) == 0:
@@ -313,6 +330,13 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
result[metric_display_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
)
if metric_name == EvaluationMetrics.CODE_VULNERABILITY or metric_name == EvaluationMetrics.ISA:
# Add all attributes under the details.
details = {}
for key, value in parsed_response.items():
if key not in {"label", "reasoning", "version"}:
details[key.replace("-", "_")] = value
result[metric_display_name + "_details"] = details
return result
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
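For the two new metrics, everything in the annotator's parsed response other than `label`, `reasoning`, and `version` is collected into a `<metric>_details` dictionary, with hyphens normalized to underscores. A hypothetical parsed response and the details it would yield (attribute names and values are invented for illustration):

```python
parsed_response = {
    "label": True,
    "reasoning": "The completion concatenates user input into a SQL statement.",
    "version": "1.0",
    "sql-injection": True,
    "code-injection": False,
}

# Mirrors the loop above: skip label/reasoning/version, normalize hyphens.
details = {
    key.replace("-", "_"): value
    for key, value in parsed_response.items()
    if key not in {"label", "reasoning", "version"}
}
# details == {"sql_injection": True, "code_injection": False}
# and the evaluator output would then carry e.g. "code_vulnerability_details": details
```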

@@ -26,6 +26,8 @@ class EvaluationMetrics:
FICTIONAL_CHARACTERS = "fictional_characters"
LOGOS_AND_BRANDS = "logos_and_brands"
XPIA = "xpia"
CODE_VULNERABILITY = "code_vulnerability"
ISA = "inference_sensitive_attributes"


class _InternalEvaluationMetrics:
@@ -152,12 +152,17 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
EvaluationMetrics.LOGOS_AND_BRANDS,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.ISA,
]
label_cols = []
details_cols = []
for col in df.columns:
metric_name = col.split(".")[1]
if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
label_cols.append(col)
if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
details_cols = col

label_df = df[label_cols]
defect_rates = {}
@@ -169,8 +174,30 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
except EvaluationException: # only exception that can be cause is all NaN values
msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
LOGGER.warning(msg)

for details_col in details_cols:
details_df = df[details_col]
detail_defect_rates = {}

for _, value in details_df.items():
_process_rows(value, detail_defect_rates)

for key, value in detail_defect_rates.items():
col_with_boolean_values = pd.to_numeric(value, errors="coerce")
try:
defect_rates[f"{details_col}.{key}_defect_rate"] = round(list_mean_nan_safe(col_with_boolean_values), 2)
except EvaluationException: # only exception that can be caused is all NaN values
msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
LOGGER.warning(msg)

return label_cols, defect_rates

def _process_rows(row, detail_defect_rates):
for key, value in row.items():
if key not in detail_defect_rates:
detail_defect_rates[key] = []
detail_defect_rates[key].append(value)
return detail_defect_rates
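Taken together, `_process_rows` pivots the per-row details dictionaries into per-attribute lists, and the loop above turns each list into a defect rate. A standalone sketch of that aggregation with invented rows and an illustrative column prefix (the SDK uses its NaN-safe `list_mean_nan_safe` helper; a plain mean is enough here):

```python
import pandas as pd

# One details dict per evaluated row, as produced by parse_response.
details_rows = [
    {"sql_injection": True, "code_injection": False},
    {"sql_injection": False, "code_injection": False},
    {"sql_injection": True, "code_injection": True},
]

# Pivot rows into per-attribute lists (what _process_rows does).
per_attribute = {}
for row in details_rows:
    for key, value in row.items():
        per_attribute.setdefault(key, []).append(value)

# Coerce each list of booleans to numbers and average it into a defect rate.
defect_rates = {
    f"code_vulnerability_details.{key}_defect_rate": round(float(pd.to_numeric(values, errors="coerce").mean()), 2)
    for key, values in per_attribute.items()
}
print(defect_rates)
# e.g. {'...sql_injection_defect_rate': 0.67, '...code_injection_defect_rate': 0.33}
```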

def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
"""Aggregate metrics from the evaluation results.
@@ -0,0 +1,5 @@
from ._code_vulnerability import CodeVulnerabilityEvaluator

__all__ = [
"CodeVulnerabilityEvaluator",
]
@@ -0,0 +1,120 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Union

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

# cspell:ignore ssrf, vuln
@experimental
class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
where query represents the user query or code before the completion, and response represents the code recommended by the assistant.

The code vulnerability evaluation checks for vulnerabilities in the following coding languages:

- Python
- Java
- C++
- C#
- Go
- JavaScript
- SQL

The code vulnerability evaluation identifies the following vulnerabilities:

- path-injection
- sql-injection
- code-injection
- stack-trace-exposure
- incomplete-url-substring-sanitization
- flask-debug
- clear-text-logging-sensitive-data
- incomplete-hostname-regexp
- server-side-unvalidated-url-redirection
- weak-cryptographic-algorithm
- full-ssrf
- bind-socket-all-network-interfaces
- client-side-unvalidated-url-redirection
- likely-bugs
- reflected-xss
- clear-text-storage-sensitive-data
- tarslip
- hardcoded-credentials
- insecure-randomness

:param credential: The credential for connecting to Azure AI project. Required
:type credential: ~azure.core.credentials.TokenCredential
:param azure_ai_project: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any

.. admonition:: Example:

.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START code_vulnerability_evaluator]
:end-before: [END code_vulnerability_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a CodeVulnerabilityEvaluator with a query and response.

.. note::

If this evaluator is supplied to the `evaluate` function, the metric
for the code vulnerability will be "code_vulnerability_label".
"""

id = "code_vulnerability"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(
self,
credential,
azure_ai_project,
):
super().__init__(
eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
azure_ai_project=azure_ai_project,
credential=credential,
)

@overload
def __call__(
self,
*,
query: str,
response: str,
):
"""Evaluate a given query/response pair for code vulnerability

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""

@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:keyword query: The query to be evaluated.
:paramtype query: Optional[str]
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
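A minimal call sketch for the new evaluator. The query/response snippet is invented, and the commented output shape is an assumption based on the `_label` / `_reason` / `_details` keys produced in `parse_response`, not captured output:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

# Placeholder project scope; substitute real values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

code_vuln_eval = CodeVulnerabilityEvaluator(DefaultAzureCredential(), azure_ai_project)
result = code_vuln_eval(
    query="def get_user(user_id):",
    response="    return db.execute('SELECT * FROM users WHERE id = ' + user_id)",
)

# Assumed result shape, e.g.:
# {
#     "code_vulnerability_label": True,
#     "code_vulnerability_reason": "...",
#     "code_vulnerability_details": {"sql_injection": True, "code_injection": False, ...},
# }
print(result)
```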
@@ -168,4 +168,8 @@ def _get_task(self):
return _InternalAnnotationTasks.ECI
if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
return Tasks.PROTECTED_MATERIAL
if self._eval_metric == EvaluationMetrics.CODE_VULNERABILITY:
return Tasks.CODE_VULNERABILITY
if self._eval_metric == EvaluationMetrics.ISA:
return Tasks.ISA
return Tasks.CONTENT_HARM
@@ -0,0 +1,5 @@
from ._isa import ISAEvaluator

__all__ = [
"ISAEvaluator",
]
@@ -0,0 +1,102 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Optional, Union

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

@experimental
class ISAEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
Evaluates ungrounded inference of sensitive attributes for a given query, response, and context for a single-turn
evaluation only, where query represents the user query and response represents the AI system response given the provided context.

Inference of sensitive attributes checks whether a response is ungrounded and whether it contains
information about someone's protected class or emotional state.

The inference of sensitive attributes evaluation reports the following attributes:

- emotional_state
- protected_class
- groundedness

:param credential: The credential for connecting to Azure AI project. Required
:type credential: ~azure.core.credentials.TokenCredential
:param azure_ai_project: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any

.. admonition:: Example:

.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START isa_evaluator]
:end-before: [END isa_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call an ISAEvaluator with a query, response, and context.

.. note::

If this evaluator is supplied to the `evaluate` function, the metric
for the inference of sensitive attributes will be "inference_sensitive_attributes_label".
"""

id = "inference_sensitive_attributes"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(
self,
credential,
azure_ai_project,
):
super().__init__(
eval_metric=EvaluationMetrics.ISA,
azure_ai_project=azure_ai_project,
credential=credential,
)

@overload
def __call__(
self,
*,
query: str,
response: str,
context: Optional[str] = None,
):
"""Evaluate a given query/response pair and context for inference of sensitive attributes

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword context: The context to be used for evaluation.
:paramtype context: Optional[str]
:return: The inference of sensitive attributes label.
:rtype: Dict[str, Union[str, bool]]
"""

@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""Evaluate a given query/response pair and context for inference of sensitive attributes

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword context: The context to be used for evaluation.
:paramtype context: str
:return: The inference of sensitive attributes label.
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
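The docstring note above says that when this evaluator is supplied to the `evaluate` function the metric surfaces as `inference_sensitive_attributes_label`. A hedged sketch of that batch usage, assuming a local `data.jsonl` whose rows carry `query`, `response`, and `context` fields (file name and project values are illustrative):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ISAEvaluator, evaluate

# Placeholder project scope; substitute real values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

isa_eval = ISAEvaluator(DefaultAzureCredential(), azure_ai_project)

# Each line of data.jsonl is expected to provide query, response, and context.
results = evaluate(
    data="data.jsonl",
    evaluators={"isa": isa_eval},
)

# Aggregated metrics would include a defect rate for the label column plus the
# per-attribute details defect rates computed by _aggregate_label_defect_metrics.
print(results["metrics"])
```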