2 changes: 1 addition & 1 deletion sdk/evaluation/azure-ai-evaluation/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/evaluation/azure-ai-evaluation",
"Tag": "python/evaluation/azure-ai-evaluation_2840d9d130"
"Tag": "python/evaluation/azure-ai-evaluation_83a7766f56"
}
@@ -25,6 +25,8 @@
from ._evaluators._rouge import RougeScoreEvaluator, RougeType
from ._evaluators._similarity import SimilarityEvaluator
from ._evaluators._xpia import IndirectAttackEvaluator
from ._evaluators._code_vulnerability import CodeVulnerabilityEvaluator
from ._evaluators._isa import ISAEvaluator
from ._model_configurations import (
AzureAIProject,
AzureOpenAIModelConfiguration,
@@ -65,4 +67,6 @@
"Conversation",
"Message",
"EvaluationResult",
"CodeVulnerabilityEvaluator",
"ISAEvaluator",
]
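Both new evaluators are exported from the package root and added to `__all__`. A minimal construction sketch, assuming a `DefaultAzureCredential` and a placeholder project scope (the values below are illustrative, not taken from this PR):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator, ISAEvaluator

# Placeholder Azure AI project scope; substitute real values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

credential = DefaultAzureCredential()
code_vuln_eval = CodeVulnerabilityEvaluator(credential, azure_ai_project)
isa_eval = ISAEvaluator(credential, azure_ai_project)
```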
@@ -39,6 +39,8 @@ class Tasks:
PROTECTED_MATERIAL = "protected material"
XPIA = "xpia"
GROUNDEDNESS = "groundedness"
CODE_VULNERABILITY = "code vulnerability"
ISA = "inference sensitive attributes"


class _InternalAnnotationTasks:
@@ -61,6 +63,8 @@ class EvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
PROTECTED_MATERIAL = "protected_material"
XPIA = "xpia"
GROUNDEDNESS = "generic_groundedness"
CODE_VULNERABILITY = "code_vulnerability"
ISA = "inference_sensitive_attributes"


class _InternalEvaluationMetrics(str, Enum, metaclass=CaseInsensitiveEnumMeta):
@@ -64,6 +64,19 @@ def get_formatted_template(data: dict, annotation_task: str) -> str:
"context": data.get("context", ""),
}
return json.dumps(as_dict)
if annotation_task == Tasks.CODE_VULNERABILITY:
as_dict = {
"context": data.get("query", ""),
"completion": data.get("response", "")
}
return json.dumps(as_dict)
if annotation_task == Tasks.ISA:
as_dict = {
"query": data.get("query", ""),
"response": data.get("response", ""),
"context": data.get("context", "")
}
return json.dumps(as_dict)
as_dict = {
"query": html.escape(data.get("query", "")),
"response": html.escape(data.get("response", "")),
@@ -160,6 +173,8 @@ def generate_payload(normalized_user_text: str, metric: str, annotation_task: st
task = annotation_task
if metric == EvaluationMetrics.PROTECTED_MATERIAL:
include_metric = False
elif metric == EvaluationMetrics.ISA:
include_metric = False
elif metric == _InternalEvaluationMetrics.ECI:
include_metric = False
elif metric == EvaluationMetrics.XPIA:
@@ -274,6 +289,8 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
EvaluationMetrics.PROTECTED_MATERIAL,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.ISA,
}:
result = {}
if not batch_response or len(batch_response[0]) == 0:
@@ -313,6 +330,13 @@ def parse_response( # pylint: disable=too-many-branches,too-many-statements
result[metric_display_name + "_information_gathering"] = (
parsed_response["information_gathering"] if "information_gathering" in parsed_response else math.nan
)
if metric_name == EvaluationMetrics.CODE_VULNERABILITY or metric_name == EvaluationMetrics.ISA:
# Add all attributes under the details.
details = {}
for key, value in parsed_response.items():
if key not in {"label", "reasoning", "version"}:
details[key.replace("-", "_")] = value
result[metric_display_name + "_details"] = details
return result
return _parse_content_harm_response(batch_response, metric_name, metric_display_name)
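For the two new metrics, everything in the annotator's parsed response other than `label`, `reasoning`, and `version` is collected into a `<metric>_details` dictionary, with hyphens normalized to underscores. A hypothetical parsed response and the details it would yield (attribute names and values are invented for illustration):

```python
parsed_response = {
    "label": True,
    "reasoning": "The completion concatenates user input into a SQL statement.",
    "version": "1.0",
    "sql-injection": True,
    "code-injection": False,
}

# Mirrors the loop above: skip label/reasoning/version, normalize hyphens.
details = {
    key.replace("-", "_"): value
    for key, value in parsed_response.items()
    if key not in {"label", "reasoning", "version"}
}
# details == {"sql_injection": True, "code_injection": False}
# and the evaluator output would then carry e.g. "code_vulnerability_details": details
```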

@@ -26,6 +26,8 @@ class EvaluationMetrics:
FICTIONAL_CHARACTERS = "fictional_characters"
LOGOS_AND_BRANDS = "logos_and_brands"
XPIA = "xpia"
CODE_VULNERABILITY = "code_vulnerability"
ISA = "inference_sensitive_attributes"


class _InternalEvaluationMetrics:
@@ -152,12 +152,17 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
EvaluationMetrics.LOGOS_AND_BRANDS,
_InternalEvaluationMetrics.ECI,
EvaluationMetrics.XPIA,
EvaluationMetrics.CODE_VULNERABILITY,
EvaluationMetrics.ISA,
]
label_cols = []
details_cols = []
for col in df.columns:
metric_name = col.split(".")[1]
if metric_name.endswith("_label") and metric_name.replace("_label", "").lower() in handled_metrics:
label_cols.append(col)
if metric_name.endswith("_details") and metric_name.replace("_details", "").lower() in handled_metrics:
details_cols = col

label_df = df[label_cols]
defect_rates = {}
@@ -169,8 +174,30 @@ def _aggregate_label_defect_metrics(df: pd.DataFrame) -> Tuple[List[str], Dict[s
except EvaluationException: # only exception that can be cause is all NaN values
msg = f"All score evaluations are NaN/None for column {col}. No aggregation can be performed."
LOGGER.warning(msg)

for details_col in details_cols:
details_df = df[details_col]
detail_defect_rates = {}

for _, value in details_df.items():
_process_rows(value, detail_defect_rates)

for key, value in detail_defect_rates.items():
col_with_boolean_values = pd.to_numeric(value, errors="coerce")
try:
defect_rates[f"{details_col}.{key}_defect_rate"] = round(list_mean_nan_safe(col_with_boolean_values), 2)
except EvaluationException: # only exception that can be caused is all NaN values
msg = f"All score evaluations are NaN/None for column {key}. No aggregation can be performed."
LOGGER.warning(msg)

return label_cols, defect_rates

def _process_rows(row, detail_defect_rates):
for key, value in row.items():
if key not in detail_defect_rates:
detail_defect_rates[key] = []
detail_defect_rates[key].append(value)
return detail_defect_rates
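Taken together, `_process_rows` pivots the per-row details dictionaries into per-attribute lists, and the loop above turns each list into a defect rate. A standalone sketch of that aggregation with invented rows and an illustrative column prefix (the SDK uses its NaN-safe `list_mean_nan_safe` helper; a plain mean is enough here):

```python
import pandas as pd

# One details dict per evaluated row, as produced by parse_response.
details_rows = [
    {"sql_injection": True, "code_injection": False},
    {"sql_injection": False, "code_injection": False},
    {"sql_injection": True, "code_injection": True},
]

# Pivot rows into per-attribute lists (what _process_rows does).
per_attribute = {}
for row in details_rows:
    for key, value in row.items():
        per_attribute.setdefault(key, []).append(value)

# Coerce each list of booleans to numbers and average it into a defect rate.
defect_rates = {
    f"code_vulnerability_details.{key}_defect_rate": round(float(pd.to_numeric(values, errors="coerce").mean()), 2)
    for key, values in per_attribute.items()
}
print(defect_rates)
# e.g. {'...sql_injection_defect_rate': 0.67, '...code_injection_defect_rate': 0.33}
```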

def _aggregate_metrics(df: pd.DataFrame, evaluators: Dict[str, Callable]) -> Dict[str, float]:
"""Aggregate metrics from the evaluation results.
@@ -0,0 +1,5 @@
from ._code_vulnerability import CodeVulnerabilityEvaluator

__all__ = [
"CodeVulnerabilityEvaluator",
]
@@ -0,0 +1,120 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Union

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

# cspell:ignore ssrf, vuln
@experimental
class CodeVulnerabilityEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
Evaluates code vulnerability for a given query and response for a single-turn evaluation only,
where query represents the user query or code before the completion, and response represents the code recommended by the assistant.

The code vulnerability evaluation checks for vulnerabilities in the following coding languages:

- Python
- Java
- C++
- C#
- Go
- JavaScript
- SQL

The code vulnerability evaluation identifies the following vulnerabilities:

- path-injection
- sql-injection
- code-injection
- stack-trace-exposure
- incomplete-url-substring-sanitization
- flask-debug
- clear-text-logging-sensitive-data
- incomplete-hostname-regexp
- server-side-unvalidated-url-redirection
- weak-cryptographic-algorithm
- full-ssrf
- bind-socket-all-network-interfaces
- client-side-unvalidated-url-redirection
- likely-bugs
- reflected-xss
- clear-text-storage-sensitive-data
- tarslip
- hardcoded-credentials
- insecure-randomness

:param credential: The credential for connecting to Azure AI project. Required
:type credential: ~azure.core.credentials.TokenCredential
:param azure_ai_project: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any

.. admonition:: Example:

.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START code_vulnerability_evaluator]
:end-before: [END code_vulnerability_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call a CodeVulnerabilityEvaluator with a query and response.

.. note::

If this evaluator is supplied to the `evaluate` function, the metric
for the code vulnerability will be "code_vulnerability_label".
"""

id = "code_vulnerability"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(
self,
credential,
azure_ai_project,
):
super().__init__(
eval_metric=EvaluationMetrics.CODE_VULNERABILITY,
azure_ai_project=azure_ai_project,
credential=credential,
)

@overload
def __call__(
self,
*,
query: str,
response: str,
):
"""Evaluate a given query/response pair for code vulnerability

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:return: The code vulnerability label.
:rtype: Dict[str, Union[str, bool]]
"""

@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""Evaluate code vulnerability. Accepts query and response for a single-turn evaluation only.

:keyword query: The query to be evaluated.
:paramtype query: Optional[str]
:keyword response: The response to be evaluated.
:paramtype response: Optional[str]
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
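A minimal call sketch for the new evaluator. The query/response snippet is invented, and the commented output shape is an assumption based on the `_label` / `_reason` / `_details` keys produced in `parse_response`, not captured output:

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import CodeVulnerabilityEvaluator

# Placeholder project scope; substitute real values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

code_vuln_eval = CodeVulnerabilityEvaluator(DefaultAzureCredential(), azure_ai_project)
result = code_vuln_eval(
    query="def get_user(user_id):",
    response="    return db.execute('SELECT * FROM users WHERE id = ' + user_id)",
)

# Assumed result shape, e.g.:
# {
#     "code_vulnerability_label": True,
#     "code_vulnerability_reason": "...",
#     "code_vulnerability_details": {"sql_injection": True, "code_injection": False, ...},
# }
print(result)
```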
@@ -168,4 +168,8 @@ def _get_task(self):
return _InternalAnnotationTasks.ECI
if self._eval_metric == EvaluationMetrics.PROTECTED_MATERIAL:
return Tasks.PROTECTED_MATERIAL
if self._eval_metric == EvaluationMetrics.CODE_VULNERABILITY:
return Tasks.CODE_VULNERABILITY
if self._eval_metric == EvaluationMetrics.ISA:
return Tasks.ISA
return Tasks.CONTENT_HARM
@@ -0,0 +1,5 @@
from ._isa import ISAEvaluator

__all__ = [
"ISAEvaluator",
]
@@ -0,0 +1,102 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing_extensions import overload, override
from typing import Optional, Union

from azure.ai.evaluation._common._experimental import experimental
from azure.ai.evaluation._common.constants import EvaluationMetrics
from azure.ai.evaluation._evaluators._common import RaiServiceEvaluatorBase

@experimental
class ISAEvaluator(RaiServiceEvaluatorBase[Union[str, bool]]):
"""
Evaluates ungrounded inference of sensitive attributes for a given query, response, and context for a single-turn
evaluation only, where query represents the user query and response represents the AI system response given the provided context.

Inference of sensitive attributes checks whether a response is ungrounded and whether it contains
information about someone's protected class or emotional state.

The inference of sensitive attributes evaluation reports the following attributes:

- emotional_state
- protected_class
- groundedness

:param credential: The credential for connecting to Azure AI project. Required
:type credential: ~azure.core.credentials.TokenCredential
:param azure_ai_project: The scope of the Azure AI project.
It contains subscription id, resource group, and project name.
:type azure_ai_project: ~azure.ai.evaluation.AzureAIProject
:param kwargs: Additional arguments to pass to the evaluator.
:type kwargs: Any

.. admonition:: Example:

.. literalinclude:: ../samples/evaluation_samples_evaluate.py
:start-after: [START isa_evaluator]
:end-before: [END isa_evaluator]
:language: python
:dedent: 8
:caption: Initialize and call an ISAEvaluator with a query, response, and context.

.. note::

If this evaluator is supplied to the `evaluate` function, the metric
for the inference of sensitive attributes will be "inference_sensitive_attributes_label".
"""

id = "inference_sensitive_attributes"
"""Evaluator identifier, experimental and to be used only with evaluation in cloud."""

@override
def __init__(
self,
credential,
azure_ai_project,
):
super().__init__(
eval_metric=EvaluationMetrics.ISA,
azure_ai_project=azure_ai_project,
credential=credential,
)

@overload
def __call__(
self,
*,
query: str,
response: str,
context: Optional[str] = None,
):
"""Evaluate a given query/response pair and context for inference of sensitive attributes

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword context: The context to be used for evaluation.
:paramtype context: Optional[str]
:return: The inference of sensitive attributes label.
:rtype: Dict[str, Union[str, bool]]
"""

@override
def __call__( # pylint: disable=docstring-missing-param
self,
*args,
**kwargs,
):
"""Evaluate a given query/response pair and context for inference of sensitive attributes

:keyword query: The query to be evaluated.
:paramtype query: str
:keyword response: The response to be evaluated.
:paramtype response: str
:keyword context: The context to be used for evaluation.
:paramtype context: str
:return: The inference of sensitive attributes label.
:rtype: Dict[str, Union[str, bool]]
"""

return super().__call__(*args, **kwargs)
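The docstring note above says that when this evaluator is supplied to the `evaluate` function the metric surfaces as `inference_sensitive_attributes_label`. A hedged sketch of that batch usage, assuming a local `data.jsonl` whose rows carry `query`, `response`, and `context` fields (file name and project values are illustrative):

```python
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import ISAEvaluator, evaluate

# Placeholder project scope; substitute real values.
azure_ai_project = {
    "subscription_id": "<subscription-id>",
    "resource_group_name": "<resource-group>",
    "project_name": "<project-name>",
}

isa_eval = ISAEvaluator(DefaultAzureCredential(), azure_ai_project)

# Each line of data.jsonl is expected to provide query, response, and context.
results = evaluate(
    data="data.jsonl",
    evaluators={"isa": isa_eval},
)

# Aggregated metrics would include a defect rate for the label column plus the
# per-attribute details defect rates computed by _aggregate_label_defect_metrics.
print(results["metrics"])
```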