Skip to content

Commit

Permalink
fix/evaluations multiple tool pairing and empty utterance pairing
Browse files Browse the repository at this point in the history
  • Loading branch information
Gabi Chueh committed Dec 6, 2024
1 parent 60b589e commit cb912be
Showing 1 changed file with 79 additions and 61 deletions.
140 changes: 79 additions & 61 deletions src/dfcx_scrapi/tools/evaluations.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

import numpy as np
import pandas as pd
from itertools import zip_longest
from google.cloud.dialogflowcx_v3beta1 import types
from google.oauth2 import service_account
from tqdm import tqdm
Expand Down Expand Up @@ -171,33 +172,48 @@ def process_flow_invocations(

@staticmethod
def process_tool_invocations(
    tool_responses: List[Dict],
    index: int,
    row: pd.Series,
    df: pd.DataFrame,
) -> pd.DataFrame:
    """Write collected tool responses onto their paired dataframe rows.

    The target rows come from ``row["tool_pair"]``, which is expected to be
    the string form of a list of row labels (e.g. ``"[3, 4]"``). When no
    pairing is present (``None``, NaN, an empty/"nan" string or an empty
    list) the responses are recorded on ``index`` itself.

    Args:
        tool_responses: Tool response dicts in invocation order. Each may
            carry ``tool_name``, ``tool_action``, ``input_params`` and
            ``output_params``. The list is not mutated.
        index: Label of the row currently being processed.
        row: The dataframe row at ``index``; only ``tool_pair`` is read.
        df: Dataframe that receives the ``res_*`` columns.

    Returns:
        The same dataframe, updated in place. The i-th target row receives
        the i-th tool response; target rows without a matching response
        are filled with ``"NO_TOOL_RESPONSE"``. If ``tool_pair`` cannot be
        parsed the dataframe is returned unchanged and a warning is logged.
    """
    import logging  # local import keeps this block self-contained

    result_cols = [
        "res_tool_name",
        "res_tool_action",
        "res_input_params",
        "res_output_params",
    ]
    missing = "NO_TOOL_RESPONSE"

    raw_pair = row["tool_pair"]
    if isinstance(raw_pair, (list, tuple)):
        target_rows = list(raw_pair)
    elif isinstance(raw_pair, str):
        text = raw_pair.strip()
        if text in ("", "NaN", "nan"):
            target_rows = []
        else:
            try:
                parsed = literal_eval(text)
            except (ValueError, SyntaxError):
                logging.warning(
                    "Could not parse tool_pair %r for row %s; "
                    "skipping tool responses.",
                    raw_pair,
                    index,
                )
                return df
            # A bare scalar such as "5" is treated as a one-element list.
            if isinstance(parsed, (list, tuple)):
                target_rows = list(parsed)
            else:
                target_rows = [parsed]
    elif raw_pair is None or pd.isna(raw_pair):
        # Real NaN / pd.NA values are not caught by a plain `in [...]` test.
        target_rows = []
    else:
        target_rows = [raw_pair]

    # No expected tool rows: record what the agent did on this row.
    if not target_rows:
        target_rows = [index]

    for position, idx in enumerate(target_rows):
        if position < len(tool_responses):
            tool = tool_responses[position]
            values = [
                tool.get("tool_name", ""),
                tool.get("tool_action", ""),
                str(tool.get("input_params", {})),
                str(tool.get("output_params", {})),
            ]
        else:
            values = [missing] * len(result_cols)

        row_label = int(idx)
        for column, value in zip(result_cols, values):
            df.loc[row_label, column] = value

    return df

Expand Down Expand Up @@ -403,14 +419,14 @@ def run_detect_intent_queries(self, df: pd.DataFrame) -> pd.DataFrame:
text=row["action_input"],
parameters=session_parameters
)

# Add data to the existing row
df.loc[index, ["session_id", "agent_id"]] = [
data["session_id"],
data["agent_id"],
]
text_res = self.ar._extract_text(res)
utterance_idx = int(row["utterance_pair"])

utterance_idx = int(row["utterance_pair"] or index) # if utterance_pair is empty/''/NaN, use index
df.loc[utterance_idx, ["agent_response"]] = [text_res]

# Handle Playbook Invocations
Expand All @@ -431,16 +447,13 @@ def run_detect_intent_queries(self, df: pd.DataFrame) -> pd.DataFrame:

# Handle Tool Invocations
if "tool_call_quality" in self.user_input_metrics:
tool_responses = (
self.sessions_client.collect_tool_responses(res)
)
if len(tool_responses) > 0:
df = self.process_tool_invocations(
tool_responses, index, row, df
)
tool_responses = self.sessions_client.collect_tool_responses(res)
if tool_responses: # Only call if not empty
df = self.process_tool_invocations(tool_responses, index, row, df)

return df


def run_evals(self, df: pd.DataFrame) -> pd.DataFrame:
print("Starting Evals...")

Expand Down Expand Up @@ -556,24 +569,24 @@ def get_matching_list_idx(a, b):

@staticmethod
def pair_utterances(df: pd.DataFrame) -> pd.DataFrame:
    """Pair each user utterance with its agent response, per ``eval_id``.

    Within every ``eval_id`` group the n-th "User Utterance" row is paired
    with the n-th "Agent Response" row. The label of the agent row is
    stored, as a string, in the ``utterance_pair`` column of the user row.

    User utterances without a matching agent response keep NaN in
    ``utterance_pair``. Surplus agent responses (more responses than
    utterances in a group) are ignored rather than being written to a
    non-existent row.

    Args:
        df: Dataframe with ``eval_id`` and ``action_type`` columns.

    Returns:
        The same dataframe with the ``utterance_pair`` column (re)built.
    """
    # Object dtype so string labels can sit next to the NaN placeholder
    # without assigning strings into a float64 column.
    df["utterance_pair"] = pd.Series(np.nan, index=df.index, dtype=object)

    for _, group in df.groupby("eval_id"):
        actions = group["action_type"]
        user_rows = actions.index[actions == "User Utterance"].tolist()
        agent_rows = actions.index[actions == "Agent Response"].tolist()

        # zip stops at the shorter list: unpaired users stay NaN and no
        # (None, agent_idx) pair is ever produced.
        for user_idx, agent_idx in zip(user_rows, agent_rows):
            df.loc[user_idx, "utterance_pair"] = str(agent_idx)

    return df

Expand Down Expand Up @@ -602,27 +615,32 @@ def get_model_name(settings: types.GenerativeSettings) -> str:

return model_map.get(model_name, "")


def pair_tool_calls(self, df: pd.DataFrame) -> pd.DataFrame:
    """Associate user utterances with the tool invocations that follow them.

    Rows are scanned in order within each ``eval_id`` group. Every
    "Tool Invocation" row is attributed to the most recent preceding
    "User Utterance" row of the same group. The user row's ``tool_pair``
    column receives the string form of the list of tool row labels
    (e.g. ``"[3, 4]"``).

    Rows with no associated tool invocations keep the empty string in
    ``tool_pair``; this sentinel is used uniformly, regardless of where
    the utterance sits in the group. Tool invocations that appear before
    the first user utterance of a group are not attributed to any row.

    Args:
        df: Dataframe with ``eval_id`` and ``action_type`` columns.

    Returns:
        The same dataframe with the ``tool_pair`` column (re)built.
    """
    df["tool_pair"] = ""  # empty string marks "no paired tool calls"

    for _, group in df.groupby("eval_id"):
        tools_by_user = {}
        current_user = None

        # tolist() yields plain Python labels so str(list) stays parseable.
        labels = group.index.tolist()
        actions = group["action_type"].tolist()
        for label, action in zip(labels, actions):
            if action == "User Utterance":
                current_user = label
            elif action == "Tool Invocation" and current_user is not None:
                tools_by_user.setdefault(current_user, []).append(label)

        for user_label, tool_labels in tools_by_user.items():
            df.loc[user_label, "tool_pair"] = str(tool_labels)

    return df

Expand Down Expand Up @@ -836,4 +854,4 @@ def from_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
"""Load eval dataset from local premade dataframe."""
df = self.validate_and_prep_inputs(df)

return df
return df

0 comments on commit cb912be

Please sign in to comment.