
Commit 285d175

Updated task type and artifact download for evaluation (#32734)
* Evaluate updated task types
* New task type and downloading artifacts to output_path
* Spell check fix
1 parent e4edbeb commit 285d175

6 files changed

Lines changed: 73 additions & 17 deletions

File tree

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate
    __init__.py
    _base_handler.py
    _constants.py
    _evaluate.py
    _evaluation_result.py
    _metric_handler.py

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -6,11 +6,13 @@
 
 try:
     from ._evaluate import evaluate
+    from ._evaluation_result import EvaluationResult
 except ModuleNotFoundError as ex:
     print("Please make sure evaluate extras is installed. Please run the following command to install "
           "azure-ai-generative[evaluate]")
     raise ex
 
 __all__ = [
-    "evaluate"
+    "evaluate",
+    "EvaluationResult"
 ]
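With this change, EvaluationResult joins evaluate in the package's public surface, so both can be imported directly (assuming the evaluate extra is installed):

    from azure.ai.generative.evaluate import evaluate, EvaluationResult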

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_base_handler.py

Lines changed: 0 additions & 2 deletions
@@ -1,8 +1,6 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-from ._metric_handler import MetricHandler
-from ._utils import _has_column
 import abc
 import pandas as pd
 

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_constants.py

Lines changed: 12 additions & 2 deletions
@@ -4,9 +4,19 @@
 
 from azureml.metrics import constants
 
-CHAT = "chat"
+QA = "qa"
+QA_RAG = "qa-rag"
+CHAT_RAG = "chat-rag"
+
+SUPPORTED_TASK_TYPE = [QA, QA_RAG, CHAT_RAG]
+
+SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING = {
+    "qa": constants.QUESTION_ANSWERING,
+    "qa-rag": constants.RAG_EVALUATION,
+    "chat-rag": constants.RAG_EVALUATION,
+}
 
 TYPE_TO_KWARGS_MAPPING = {
     constants.QUESTION_ANSWERING: ["questions", "contexts", "y_pred", "y_test"],
-    CHAT: ["y_pred"]
+    constants.RAG_EVALUATION: ["y_pred"]
 }
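The user-facing task-type strings are now decoupled from the azureml.metrics task types: callers pass "qa", "qa-rag", or "chat-rag", and the mapping above translates them before metric computation. A minimal sketch of the validate-then-map pattern the evaluator now uses internally (variable names here are illustrative):

    from azureml.metrics import constants
    from azure.ai.generative.evaluate._constants import (
        SUPPORTED_TASK_TYPE,
        SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING,
    )

    task_type = "chat-rag"  # user-facing task type
    if task_type not in SUPPORTED_TASK_TYPE:
        raise Exception(f"task type {task_type} is not supported")

    # Both RAG flavors resolve to the same azureml.metrics task type.
    metrics_task_type = SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING[task_type]
    assert metrics_task_type == constants.RAG_EVALUATION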

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluate.py

Lines changed: 23 additions & 6 deletions
@@ -11,8 +11,6 @@
 
 import mlflow
 import pandas as pd
-from azureml.metrics import constants
-from azure.ai.generative.evaluate._constants import CHAT
 
 from mlflow.entities import Metric
 from mlflow.exceptions import MlflowException
@@ -21,6 +19,9 @@
 from azure.ai.generative.evaluate._metric_handler import MetricHandler
 from azure.ai.generative.evaluate._utils import _is_flow, load_jsonl, _get_artifact_dir_path
 from azure.ai.generative.evaluate._mlflow_log_collector import RedirectUserOutputStreams
+from azure.ai.generative.evaluate._constants import SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING, SUPPORTED_TASK_TYPE, QA_RAG, \
+    CHAT_RAG
+from azure.ai.generative.evaluate._evaluation_result import EvaluationResult
 
 from ._utils import _write_properties_to_run_history
 
@@ -85,6 +86,7 @@ def evaluate(
     metrics_list=None,
     model_config=None,
     data_mapping=None,
+    output_path=None,
     **kwargs
 ):
     results_list = []
@@ -117,6 +119,7 @@ def evaluate(
             data_mapping=data_mapping,
             params_dict=params_permutations_dict,
             metrics=metrics_list,
+            output_path=output_path,
             **kwargs
         )
         results_list.append(evaluation_results)
@@ -130,6 +133,7 @@ def evaluate(
             model_config=model_config,
            data_mapping=data_mapping,
             metrics=metrics_list,
+            output_path=output_path,
             **kwargs
         )
 
@@ -146,6 +150,7 @@ def _evaluate(
     metrics=None,
     data_mapping=None,
     model_config=None,
+    output_path=None,
     **kwargs
 ):
     try:
@@ -166,7 +171,7 @@ def _evaluate(
     if target is None and prediction_data is None:
         raise Exception("target and prediction data cannot be null")
 
-    if task_type not in [constants.Tasks.QUESTION_ANSWERING, CHAT]:
+    if task_type not in SUPPORTED_TASK_TYPE:
         raise Exception(f"task type {task_type} is not supported")
 
     metrics_config = {}
@@ -195,7 +200,7 @@ def _evaluate(
     )
 
     metrics_handler = MetricHandler(
-        task_type=task_type,
+        task_type=SUPPORTED_TO_METRICS_TASK_TYPE_MAPPING[task_type],
         metrics=metrics,
         prediction_data=asset_handler.prediction_data,
         truth_data=asset_handler.ground_truth,
@@ -209,7 +214,7 @@ def _evaluate(
 
     def _get_instance_table():
         metrics.get("artifacts").pop("bertscore", None)
-        if task_type == "chat":
+        if task_type in [QA_RAG, CHAT_RAG]:
             instance_level_metrics_table = _get_chat_instance_table(metrics.get("artifacts"))
         else:
             instance_level_metrics_table = pd.DataFrame(metrics.get("artifacts"))
@@ -270,7 +275,19 @@ def _get_instance_table():
     mlflow.log_param("task_type", task_type)
     log_param_and_tag("_azureml.evaluate_metric_mapping", json.dumps(metrics_handler._metrics_mapping_to_log))
 
-    return metrics
+    evaluation_result = EvaluationResult(
+        metrics_summary=metrics.get("metrics"),
+        artifacts={
+            "eval_results.jsonl": f"runs:/{run.info.run_id}/eval_results.jsonl"
+        },
+        tracking_uri=kwargs.get("tracking_uri"),
+        evaluation_id=run.info.run_id
+    )
+    if output_path:
+        evaluation_result.download_evaluation_artifacts(path=output_path)
+
+    return evaluation_result
+
 
 
 def log_input(data, data_is_file):
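Taken together, these hunks let callers pass output_path to evaluate and get an EvaluationResult back instead of the raw metrics dict. A hedged sketch of the new call shape; only task_type, data_mapping, metrics_list, model_config, and output_path are visible in this diff, so the remaining argument names and all values are illustrative placeholders:

    from azure.ai.generative.evaluate import evaluate

    def my_target(question):  # placeholder target; real targets may be models or flows
        return {"answer": "..."}

    result = evaluate(
        target=my_target,                        # assumed parameter name
        data="eval_data.jsonl",                  # placeholder dataset path
        task_type="qa-rag",                      # one of "qa", "qa-rag", "chat-rag"
        data_mapping={"questions": "question"},  # placeholder mapping
        output_path="./eval_output",             # new: eval_results.jsonl is downloaded here
    )
    print(result.metrics_summary)
    print(result.artifacts)  # {"eval_results.jsonl": "runs:/<run_id>/eval_results.jsonl"}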
sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_evaluation_result.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+
+
+class EvaluationResult(object):
+
+    def __init__(self, metrics_summary: dict, artifacts: dict, **kwargs):
+        self._metrics_summary = metrics_summary
+        self._artifacts = artifacts
+        self._tracking_uri = kwargs.get("tracking_uri")
+        self._evaluation_id = kwargs.get("evaluation_id")
+
+    @property
+    def metrics_summary(self) -> dict[str, float]:
+        return self._metrics_summary
+
+    @property
+    def artifacts(self) -> dict[str, str]:
+        return self._artifacts
+
+    @property
+    def tracking_uri(self) -> str:
+        return self._tracking_uri
+
+    def download_evaluation_artifacts(self, path: str) -> None:
+        from mlflow.artifacts import download_artifacts
+        for artifact, artifact_uri in self.artifacts.items():
+            download_artifacts(
+                artifact_uri=artifact_uri,
+                tracking_uri=self.tracking_uri,
+                dst_path=path
+            )
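download_evaluation_artifacts delegates to MLflow's mlflow.artifacts.download_artifacts, resolving each runs:/ URI against the stored tracking URI. A minimal consumption sketch (the URIs and metric values below are placeholders, not output from this commit):

    from azure.ai.generative.evaluate import EvaluationResult

    result = EvaluationResult(
        metrics_summary={"gpt_groundedness": 4.2},  # placeholder value
        artifacts={"eval_results.jsonl": "runs:/<run_id>/eval_results.jsonl"},
        tracking_uri="<workspace-tracking-uri>",    # placeholder
        evaluation_id="<run_id>",
    )
    result.download_evaluation_artifacts(path="./eval_output")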

sdk/ai/azure-ai-generative/azure/ai/generative/evaluate/_metric_handler.py

Lines changed: 5 additions & 6 deletions
@@ -3,7 +3,7 @@
 # ---------------------------------------------------------
 import copy
 
-from azure.ai.generative.evaluate._constants import TYPE_TO_KWARGS_MAPPING
+from azure.ai.generative.evaluate._constants import TYPE_TO_KWARGS_MAPPING, CHAT_RAG, QA_RAG
 
 
 class MetricHandler(object):
@@ -64,8 +64,8 @@ def _get_data_for_metrics(self):
                     data_column: data_source[metrics_mapping[data_column]].values.tolist()
                 }
             )
-            poped_value = metrics_mapping.pop(data_column, None)
-            metrics_mapping_to_log[data_column] = poped_value
+            popped_value = metrics_mapping.pop(data_column, None)
+            metrics_mapping_to_log[data_column] = popped_value
 
         metrics_data.update(metrics_mapping)
 
@@ -75,14 +75,13 @@ def _get_data_for_metrics(self):
 
     def calculate_metrics(self):
         from azureml.metrics import compute_metrics, constants
-        from ._constants import CHAT
 
         metrics_calculation_data = self._get_data_for_metrics()
 
-        metrics = self.metrics if self.task_type != CHAT else []
+        metrics = self.metrics if self.task_type == constants.RAG_EVALUATION and self.metrics is not None else []
 
         return compute_metrics(
             metrics=metrics,
-            task_type=constants.Tasks.RAG_EVALUATION if self.task_type == CHAT else self.task_type,
+            task_type=self.task_type,
             **metrics_calculation_data,
         )
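MetricHandler no longer translates task types itself: it receives an azureml.metrics task type already mapped by the caller and forwards it to compute_metrics unchanged. A rough sketch of the resulting call for a RAG evaluation, under the assumption that compute_metrics accepts y_pred for this task type (per TYPE_TO_KWARGS_MAPPING above); the conversation data is a placeholder:

    from azureml.metrics import compute_metrics, constants

    conversations = [...]  # placeholder: per-conversation prediction data
    result = compute_metrics(
        metrics=[],                          # empty unless the caller supplied RAG metrics
        task_type=constants.RAG_EVALUATION,  # already mapped; no CHAT translation here
        y_pred=conversations,
    )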
