update

microsoft · Apr 24, 2024 · ccf1ee9 · ccf1ee9
1 parent 8a1d6c6
commit ccf1ee9
Show file tree

Hide file tree

Showing 9 changed files with 209 additions and 192 deletions.
diff --git a/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py b/src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
@@ -64,41 +64,65 @@ def _validation(target, data, evaluators, output_path, tracking_uri, evaluation_
     for evaluator_name, evaluator in evaluators.items():
         # Apply column mapping
         mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
-        renamed_data_df = _apply_column_mapping(data_df, "data", mapping_config)
+        new_data_df = _apply_column_mapping(data_df, mapping_config)
 
         # Validate input data for evaluator
-        _validate_input_data_for_evaluator(evaluator, evaluator_name, renamed_data_df)
+        _validate_input_data_for_evaluator(evaluator, evaluator_name, new_data_df)
 
 
-def _apply_column_mapping(source_df, source_name, mapping_config, inplace=False):
-    SUPPORTED_SOURCE_NAMES = ["data", "target"]
-
+def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False):
+    """
+    Rename source_df columns referenced as ${data.<column>} in mapping_config to the mapped keys.
+    Used to pre-validate evaluator inputs; always returns a DataFrame (source_df itself if inplace=True).
+    """
     result_df = source_df
     if mapping_config:
         column_mapping = {}
+        pattern_prefix = "data."
         for map_to_key, map_value in mapping_config.items():
             match = re.search(r"^\${([^{}]+)}$", map_value)
-
             if match is not None:
                 pattern = match.group(1)
-
-                # Check if source reference is valid
-                source_reference = pattern.split(".")[0]
-                if source_reference not in SUPPORTED_SOURCE_NAMES:
-                    raise ValueError(
-                        f"'{source_reference}' is not a valid source reference. "
-                        + f"It should be one of {SUPPORTED_SOURCE_NAMES}."
-                    )
-
-                if pattern.startswith(f"{source_name}."):
-                    map_from_key = pattern.split(f"{source_name}.")[1]
+                if pattern.startswith(pattern_prefix):
+                    map_from_key = pattern[len(pattern_prefix) :]  # slice, not split: name may contain "data."
                     column_mapping[map_from_key] = map_to_key
 
-        result_df = source_df.rename(columns=column_mapping, inplace=inplace)
+        # rename() returns None when inplace=True; source_df then already carries the new names.
+        renamed_df = source_df.rename(columns=column_mapping, inplace=inplace)
+        result_df = source_df if renamed_df is None else renamed_df
 
     return result_df
 
 
+def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]):
+    """Process evaluator_config to replace ${target.} with ${data.}.
+
+    Returns a new dict ({} when evaluator_config is None); non-dict entries are omitted.
+    Raises ValueError if a value holds a ${...} reference other than ${target.} or ${data.}.
+    """
+    processed_config = {}
+    if evaluator_config is None:
+        return processed_config
+
+    unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")
+
+    for evaluator, mapping_config in evaluator_config.items():
+        if isinstance(mapping_config, dict):
+            processed_config[evaluator] = {}
+            for map_to_key, map_value in mapping_config.items():
+                # Check if there's any unexpected reference other than ${target.} or ${data.}
+                if unexpected_references.search(map_value):
+                    raise ValueError(
+                        "Unexpected references detected in 'evaluator_config'. "
+                        "Ensure only ${target.} and ${data.} are used."
+                    )
+
+                # Replace ${target.} with ${data.}
+                processed_config[evaluator][map_to_key] = map_value.replace("${target.", "${data.")
+
+    return processed_config
+
+
 def evaluate(
     *,
     evaluation_name: Optional[str] = None,
@@ -129,16 +153,17 @@ def evaluate(
     :rtype: ~azure.ai.generative.evaluate.EvaluationResult
     """
 
+    evaluator_config = _process_evaluator_config(evaluator_config)
+
     _validation(target, data, evaluators, output_path, tracking_uri, evaluation_name, evaluator_config)
 
     pf_client = PFClient()
 
     evaluator_info = {}
 
     for evaluator_name, evaluator in evaluators.items():
-        evaluator_info.update({evaluator_name: {"client": pf_client, "evaluator": evaluator}})
-
-        evaluator_info[evaluator_name]["run"] = evaluator_info[evaluator_name]["client"].run(
+        evaluator_info[evaluator_name] = {}
+        evaluator_info[evaluator_name]["run"] = pf_client.run(
             flow=evaluator,
             column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
             data=data,
@@ -147,7 +172,7 @@ def evaluate(
 
     evaluators_result_df = None
     for evaluator_name, evaluator_info in evaluator_info.items():
-        evaluator_result_df = evaluator_info["client"].get_details(evaluator_info["run"], all_results=True)
+        evaluator_result_df = pf_client.get_details(evaluator_info["run"], all_results=True)
 
         # drop input columns
         evaluator_result_df = evaluator_result_df.drop(

diff --git a/...tests/data/column_mapping_test_data.jsonl → ...aluate_test_data_for_column_mapping.jsonl b/...tests/data/column_mapping_test_data.jsonl → ...aluate_test_data_for_column_mapping.jsonl
diff --git a/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py b/src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
@@ -16,9 +16,9 @@ def data_file():
 
 
 @pytest.fixture
-def column_mapping_data_file():
+def data_file_for_column_mapping():
     data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
-    return os.path.join(data_path, "column_mapping_test_data.jsonl")
+    return os.path.join(data_path, "evaluate_test_data_for_column_mapping.jsonl")
 
 
 def answer_evaluator(answer):
@@ -86,23 +86,23 @@ def test_evaluate_python_function(self, data_file):
         assert metrics.get("answer.length") == np.nanmean(row_result_df["outputs.answer.length"])
         assert row_result_df["outputs.answer.length"][2] == 31
 
-    @pytest.mark.usefixtures("column_mapping_data_file")
-    def test_evaluate_with_column_mapping(self, column_mapping_data_file):
+    @pytest.mark.usefixtures("data_file_for_column_mapping")
+    def test_evaluate_with_evaluator_config(self, data_file_for_column_mapping):
         # data
-        input_data = pd.read_json(column_mapping_data_file, lines=True)
+        input_data = pd.read_json(data_file_for_column_mapping, lines=True)
         f1_score_evaluator = F1ScoreEvaluator()
 
         # run the evaluation
         result = evaluate(
-            data=column_mapping_data_file,
+            data=data_file_for_column_mapping,
             evaluators={"f1_score": f1_score_evaluator, "answer": answer_evaluator},
             evaluator_config={
                 "f1_score": {
                     "ground_truth": "${data.ground_truth}",
                     "answer": "${data.response}",
                 },
                 "answer": {
-                    "answer": "${data.response}",
+                    "answer": "${target.response}",
                 },
             },
         )

diff --git a/...test_configs/generated_qa_chat_conv.jsonl → ...ittests/data/generated_qa_chat_conv.jsonl b/...test_configs/generated_qa_chat_conv.jsonl → ...ittests/data/generated_qa_chat_conv.jsonl
@@ -1,5 +1,5 @@
-{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
-{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
-{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}]}
-{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
-{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"},{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
+{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
+{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
+{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}]}
+{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
+{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"},{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
diff --git a/...est_configs/generated_qa_chat_short.jsonl → ...ttests/data/generated_qa_chat_short.jsonl b/...est_configs/generated_qa_chat_short.jsonl → ...ttests/data/generated_qa_chat_short.jsonl
@@ -1,5 +1,5 @@
-{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
-{"messages":[{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
-{"messages":[{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}]}
-{"messages":[{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
-{"messages":[{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
+{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
+{"messages":[{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
+{"messages":[{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}]}
+{"messages":[{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
+{"messages":[{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
diff --git a/...s/test_configs/generated_qa_pf_conv.jsonl → ...unittests/data/generated_qa_pf_conv.jsonl b/...s/test_configs/generated_qa_pf_conv.jsonl → ...unittests/data/generated_qa_pf_conv.jsonl
@@ -1,5 +1,5 @@
-{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
-{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}}],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
-{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}}],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}
-{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}}],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
-{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}},{"inputs":{"question":"Is K8s also a compute?"},"outputs":{"ground_truth":"Yes.\n"}}],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
+{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
+{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}}],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
+{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}}],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}
+{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}}],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
+{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}},{"inputs":{"question":"Is K8s also a compute?"},"outputs":{"ground_truth":"Yes.\n"}}],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
diff --git a/.../test_configs/generated_qa_pf_short.jsonl → ...nittests/data/generated_qa_pf_short.jsonl b/.../test_configs/generated_qa_pf_short.jsonl → ...nittests/data/generated_qa_pf_short.jsonl
@@ -1,5 +1,5 @@
-{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
-{"chat_history":[],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
-{"chat_history":[],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}
-{"chat_history":[],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
-{"chat_history":[],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
+{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
+{"chat_history":[],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
+{"chat_history":[],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n   Here's one more reason ..."}
+{"chat_history":[],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
+{"chat_history":[],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
diff --git a/src/promptflow-evals/tests/evals/unittests/test_evaluate.py b/src/promptflow-evals/tests/evals/unittests/test_evaluate.py
@@ -64,7 +64,7 @@ def test_evaluate_missing_required_inputs(self, missing_columns_jsonl_file):
 
         assert "Missing required inputs for evaluator g : ['ground_truth']." in exc_info.value.args[0]
 
-    def test_apply_column_mapping_normal(self):
+    def test_apply_column_mapping(self):
         json_data = [
             {
                 "question": "How are you?",
@@ -77,31 +77,23 @@ def test_apply_column_mapping_normal(self):
         }
 
         data_df = pd.DataFrame(json_data)
-        new_data_df = _apply_column_mapping(data_df, "data", inputs_mapping)
+        new_data_df = _apply_column_mapping(data_df, inputs_mapping)
 
         assert "question" in new_data_df.columns
         assert "answer" in new_data_df.columns
 
         assert new_data_df["question"][0] == "How are you?"
         assert new_data_df["answer"][0] == "I'm fine"
 
-    def test_apply_column_mapping_invalid_source_reference(self):
-        json_data = [
-            {
-                "question": "How are you?",
-                "ground_truth": "I'm fine",
-            }
-        ]
-        inputs_mapping = {
-            "question": "${foo.question}",
-            "answer": "${foo.ground_truth}",
-        }
-
-        data_df = pd.DataFrame(json_data)
-
+    def test_evaluate_invalid_evaluator_config(self, mock_model_config):
         with pytest.raises(ValueError) as exc_info:
-            _apply_column_mapping(data_df, "data", inputs_mapping)
+            evaluate(
+                data="data.jsonl",
+                evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
+                evaluator_config={"g": {"question": "${foo.question}"}},
+            )
 
         assert (
-            "'foo' is not a valid source reference. It should be one of ['data', 'target']." in exc_info.value.args[0]
+            "Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
+            in exc_info.value.args[0]
         )