Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
ninghu committed Apr 24, 2024
1 parent 8a1d6c6 commit ccf1ee9
Show file tree
Hide file tree
Showing 9 changed files with 209 additions and 192 deletions.
67 changes: 46 additions & 21 deletions src/promptflow-evals/promptflow/evals/evaluate/_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,41 +64,65 @@ def _validation(target, data, evaluators, output_path, tracking_uri, evaluation_
for evaluator_name, evaluator in evaluators.items():
# Apply column mapping
mapping_config = evaluator_config.get(evaluator_name, evaluator_config.get("default", None))
renamed_data_df = _apply_column_mapping(data_df, "data", mapping_config)
new_data_df = _apply_column_mapping(data_df, mapping_config)

# Validate input data for evaluator
_validate_input_data_for_evaluator(evaluator, evaluator_name, renamed_data_df)
_validate_input_data_for_evaluator(evaluator, evaluator_name, new_data_df)


def _apply_column_mapping(source_df, source_name, mapping_config, inplace=False):
SUPPORTED_SOURCE_NAMES = ["data", "target"]

def _apply_column_mapping(source_df: pd.DataFrame, mapping_config: dict, inplace: bool = False):
"""
Apply column mapping to source_df based on mapping_config.
This function is used for pre-validation of input data for evaluators
"""
result_df = source_df

if mapping_config:
column_mapping = {}
pattern_prefix = "data."

for map_to_key, map_value in mapping_config.items():
match = re.search(r"^\${([^{}]+)}$", map_value)

if match is not None:
pattern = match.group(1)

# Check if source reference is valid
source_reference = pattern.split(".")[0]
if source_reference not in SUPPORTED_SOURCE_NAMES:
raise ValueError(
f"'{source_reference}' is not a valid source reference. "
+ f"It should be one of {SUPPORTED_SOURCE_NAMES}."
)

if pattern.startswith(f"{source_name}."):
map_from_key = pattern.split(f"{source_name}.")[1]
if pattern.startswith(pattern_prefix):
map_from_key = pattern.split(pattern_prefix)[1]
column_mapping[map_from_key] = map_to_key

result_df = source_df.rename(columns=column_mapping, inplace=inplace)

return result_df


def _process_evaluator_config(evaluator_config: Dict[str, Dict[str, str]]):
"""Process evaluator_config to replace ${target.} with ${data.}"""

processed_config = {}

if evaluator_config is None:
return processed_config

unexpected_references = re.compile(r"\${(?!target\.|data\.).+?}")

for evaluator, mapping_config in evaluator_config.items():
if isinstance(mapping_config, dict):
processed_config[evaluator] = {}

for map_to_key, map_value in mapping_config.items():

# Check if there's any unexpected reference other than ${target.} or ${data.}
if unexpected_references.search(map_value):
raise ValueError(
"Unexpected references detected in 'evaluator_config'. "
"Ensure only ${target.} and ${data.} are used."
)

# Replace ${target.} with ${data.}
processed_config[evaluator][map_to_key] = map_value.replace("${target.", "${data.")

return processed_config


def evaluate(
*,
evaluation_name: Optional[str] = None,
Expand Down Expand Up @@ -129,16 +153,17 @@ def evaluate(
:rtype: ~azure.ai.generative.evaluate.EvaluationResult
"""

evaluator_config = _process_evaluator_config(evaluator_config)

_validation(target, data, evaluators, output_path, tracking_uri, evaluation_name, evaluator_config)

pf_client = PFClient()

evaluator_info = {}

for evaluator_name, evaluator in evaluators.items():
evaluator_info.update({evaluator_name: {"client": pf_client, "evaluator": evaluator}})

evaluator_info[evaluator_name]["run"] = evaluator_info[evaluator_name]["client"].run(
evaluator_info[evaluator_name] = {}
evaluator_info[evaluator_name]["run"] = pf_client.run(
flow=evaluator,
column_mapping=evaluator_config.get(evaluator_name, evaluator_config.get("default", None)),
data=data,
Expand All @@ -147,7 +172,7 @@ def evaluate(

evaluators_result_df = None
for evaluator_name, evaluator_info in evaluator_info.items():
evaluator_result_df = evaluator_info["client"].get_details(evaluator_info["run"], all_results=True)
evaluator_result_df = pf_client.get_details(evaluator_info["run"], all_results=True)

# drop input columns
evaluator_result_df = evaluator_result_df.drop(
Expand Down
14 changes: 7 additions & 7 deletions src/promptflow-evals/tests/evals/e2etests/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ def data_file():


@pytest.fixture
def column_mapping_data_file():
def data_file_for_column_mapping():
data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
return os.path.join(data_path, "column_mapping_test_data.jsonl")
return os.path.join(data_path, "evaluate_test_data_for_column_mapping.jsonl")


def answer_evaluator(answer):
Expand Down Expand Up @@ -86,23 +86,23 @@ def test_evaluate_python_function(self, data_file):
assert metrics.get("answer.length") == np.nanmean(row_result_df["outputs.answer.length"])
assert row_result_df["outputs.answer.length"][2] == 31

@pytest.mark.usefixtures("column_mapping_data_file")
def test_evaluate_with_column_mapping(self, column_mapping_data_file):
@pytest.mark.usefixtures("data_file_for_column_mapping")
def test_evaluate_with_evaluator_config(self, data_file_for_column_mapping):
# data
input_data = pd.read_json(column_mapping_data_file, lines=True)
input_data = pd.read_json(data_file_for_column_mapping, lines=True)
f1_score_evaluator = F1ScoreEvaluator()

# run the evaluation
result = evaluate(
data=column_mapping_data_file,
data=data_file_for_column_mapping,
evaluators={"f1_score": f1_score_evaluator, "answer": answer_evaluator},
evaluator_config={
"f1_score": {
"ground_truth": "${data.ground_truth}",
"answer": "${data.response}",
},
"answer": {
"answer": "${data.response}",
"answer": "${target.response}",
},
},
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"},{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."},{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."},{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."},{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"},{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
{"messages":[{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
{"messages":[{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}]}
{"messages":[{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
{"messages":[{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
{"messages":[{"role":"user","content":"What is Compute Instance?"},{"role":"assistant","content":"Compute instance is ..."}]}
{"messages":[{"role":"user","content":"Is CI different than Compute Cluster?"},{"role":"assistant","content":"Yes."}]}
{"messages":[{"role":"user","content":"In what way?"},{"role":"assistant","content":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}]}
{"messages":[{"role":"user","content":"Is K8s also a compute?"},{"role":"assistant","content":"Yes.\n"}]}
{"messages":[{"role":"user","content":"Question after space?"},{"role":"assistant","content":"Answer after space.\n\n"}]}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}}],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}}],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}}],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}},{"inputs":{"question":"Is K8s also a compute?"},"outputs":{"ground_truth":"Yes.\n"}}],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}}],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}}],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}}],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
{"chat_history":[{"inputs":{"question":"What is Compute Instance?"},"outputs":{"ground_truth":"Compute instance is ..."}},{"inputs":{"question":"Is CI different than Compute Cluster?"},"outputs":{"ground_truth":"Yes."}},{"inputs":{"question":"In what way?"},"outputs":{"ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}},{"inputs":{"question":"Is K8s also a compute?"},"outputs":{"ground_truth":"Yes.\n"}}],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
{"chat_history":[],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
{"chat_history":[],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}
{"chat_history":[],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
{"chat_history":[],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
{"chat_history":[],"question":"What is Compute Instance?","ground_truth":"Compute instance is ..."}
{"chat_history":[],"question":"Is CI different than Compute Cluster?","ground_truth":"Yes."}
{"chat_history":[],"question":"In what way?","ground_truth":"It is different ... because ...\n... these are the reasons.\n Here's one more reason ..."}
{"chat_history":[],"question":"Is K8s also a compute?","ground_truth":"Yes.\n"}
{"chat_history":[],"question":"Question after space?","ground_truth":"Answer after space.\n\n"}
28 changes: 10 additions & 18 deletions src/promptflow-evals/tests/evals/unittests/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_evaluate_missing_required_inputs(self, missing_columns_jsonl_file):

assert "Missing required inputs for evaluator g : ['ground_truth']." in exc_info.value.args[0]

def test_apply_column_mapping_normal(self):
def test_apply_column_mapping(self):
json_data = [
{
"question": "How are you?",
Expand All @@ -77,31 +77,23 @@ def test_apply_column_mapping_normal(self):
}

data_df = pd.DataFrame(json_data)
new_data_df = _apply_column_mapping(data_df, "data", inputs_mapping)
new_data_df = _apply_column_mapping(data_df, inputs_mapping)

assert "question" in new_data_df.columns
assert "answer" in new_data_df.columns

assert new_data_df["question"][0] == "How are you?"
assert new_data_df["answer"][0] == "I'm fine"

def test_apply_column_mapping_invalid_source_reference(self):
json_data = [
{
"question": "How are you?",
"ground_truth": "I'm fine",
}
]
inputs_mapping = {
"question": "${foo.question}",
"answer": "${foo.ground_truth}",
}

data_df = pd.DataFrame(json_data)

def test_evaluate_invalid_evaluator_config(self, mock_model_config):
with pytest.raises(ValueError) as exc_info:
_apply_column_mapping(data_df, "data", inputs_mapping)
evaluate(
data="data.jsonl",
evaluators={"g": GroundednessEvaluator(model_config=mock_model_config)},
evaluator_config={"g": {"question": "${foo.question}"}},
)

assert (
"'foo' is not a valid source reference. It should be one of ['data', 'target']." in exc_info.value.args[0]
"Unexpected references detected in 'evaluator_config'. Ensure only ${target.} and ${data.} are used."
in exc_info.value.args[0]
)
Loading

0 comments on commit ccf1ee9

Please sign in to comment.