diff --git a/src/dfcx_scrapi/tools/evaluations.py b/src/dfcx_scrapi/tools/evaluations.py index ca5b0434..43787d9c 100644 --- a/src/dfcx_scrapi/tools/evaluations.py +++ b/src/dfcx_scrapi/tools/evaluations.py @@ -108,6 +108,7 @@ def __init__( generation_model=self.generation_model, embedding_model=self.embedding_model ) + self.unexpected_rows = [] if debug: logging.basicConfig(level=logging.DEBUG, force=True) @@ -171,33 +172,47 @@ def process_flow_invocations( @staticmethod def process_tool_invocations( - tool_responses: List[str], + tool_responses: List[Dict], index: int, row: pd.Series, - df: pd.DataFrame) -> pd.DataFrame: - # Check if our golden contained a tool_idx or wasn't - # expecting tools + df: pd.DataFrame, + ) -> pd.DataFrame: + """Process tool invocations and map them + to the correct rows in the dataframe.""" + # Get the list of indices where tool responses should be mapped if row["tool_pair"] in [None, "", "NaN", "nan"]: tool_index_list = [index] else: tool_index_list = literal_eval(row["tool_pair"]) - for idx in tool_index_list: - tool = tool_responses.pop(0) - df.loc[ - int(idx), - [ + # Process each tool response and map it to the corresponding index + for i, idx in enumerate(tool_index_list): + if i < len(tool_responses): + tool = tool_responses[i] + df.loc[int(idx), "res_tool_name"] = ( + tool.get("tool_name", "") + ) + df.loc[int(idx), "res_tool_action"] = ( + tool.get("tool_action", "") + ) + df.loc[int(idx), "res_input_params"] = ( + str(tool.get("input_params", {})) + ) + df.loc[int(idx), "res_output_params"] = ( + str(tool.get("output_params", {})) + ) + else: + df.loc[int(idx), [ "res_tool_name", "res_tool_action", "res_input_params", - "res_output_params", - ], - ] = [ - tool["tool_name"], - tool["tool_action"], - tool["input_params"], - tool["output_params"], - ] + "res_output_params" + ]] = [ + "NO_TOOL_RESPONSE", + "NO_TOOL_RESPONSE", + "NO_TOOL_RESPONSE", + "NO_TOOL_RESPONSE" + ] return df @@ -403,15 +418,30 @@ def run_detect_intent_queries(self, df: pd.DataFrame) -> pd.DataFrame: text=row["action_input"], parameters=session_parameters ) - # Add data to the existing row df.loc[index, ["session_id", "agent_id"]] = [ data["session_id"], data["agent_id"], ] text_res = self.ar._extract_text(res) - utterance_idx = int(row["utterance_pair"]) - df.loc[utterance_idx, ["agent_response"]] = [text_res] + + # Handle Agent Responses + if row["utterance_pair"] != "": + utterance_idx = int(row["utterance_pair"]) + df.loc[utterance_idx, ["agent_response"]] = [text_res] + + else: + # collect the data for inserting later + self.unexpected_rows.append( + { + "session_id": data["session_id"], + "agent_id": data["agent_id"], + "action_type": "UNEXPECTED Agent Response", + "index": index, + "column": "agent_response", + "data": text_res + } + ) # Handle Playbook Invocations playbook_responses = ( @@ -434,13 +464,37 @@ def run_detect_intent_queries(self, df: pd.DataFrame) -> pd.DataFrame: tool_responses = ( self.sessions_client.collect_tool_responses(res) ) - if len(tool_responses) > 0: + if tool_responses: # Only call if not empty df = self.process_tool_invocations( - tool_responses, index, row, df + tool_responses, + index, + row, + df ) return df + def insert_unexpected_rows(self, df: pd.DataFrame) -> pd.DataFrame: + """Insert any unexpected rows collected during runtime.""" + if self.unexpected_rows: + for row in reversed(self.unexpected_rows): + index = row["index"] + new_row = pd.DataFrame(columns=df.columns, index=[index]) + new_row["session_id"] = row["session_id"] + new_row["agent_id"] = row["agent_id"] + new_row["action_type"] = row["action_type"] + new_row[row["column"]] = row["data"] + df = pd.concat( + [ + df.iloc[:index], + new_row, + df.iloc[index:] + ]) + + df = df.sort_index() + + return df + def run_evals(self, df: pd.DataFrame) -> pd.DataFrame: print("Starting Evals...") @@ -449,9 +503,15 @@ def run_evals(self, df: pd.DataFrame) -> pd.DataFrame: return df - def run_query_and_eval(self, df: pd.DataFrame) -> pd.DataFrame: + def scrape_results(self, df: pd.DataFrame) -> pd.DataFrame: df = self.add_response_columns(df) df = self.run_detect_intent_queries(df) + df = self.insert_unexpected_rows(df) + + return df + + def run_query_and_eval(self, df: pd.DataFrame) -> pd.DataFrame: + df = self.scrape_results(df) df = self.run_evals(df) df = self.clean_outputs(df) @@ -602,27 +662,37 @@ def get_model_name(settings: types.GenerativeSettings) -> str: return model_map.get(model_name, "") - def pair_tool_calls(self, df: pd.DataFrame) -> pd.DataFrame: - "Identifies pairings of agent_utterance/tool_invocation by eval_id." + """Pairs user utterances with indices of relevant tool invocations.""" + df["tool_pair"] = pd.Series(dtype="string") grouped = df.groupby("eval_id") for _, group in grouped: - user = group[ - group["action_type"] == "User Utterance" - ].index.tolist() - tool_list = group[ - group["action_type"] == "Tool Invocation" - ].index.tolist() - - pairs = self.get_matching_list_idx( - user, tool_list - ) - - # Create pairs of user/tool_list row indices - for pair in pairs: - df.loc[pair[0], "tool_pair"] = str(pair[1]) + tool_indices = [] + last_user_utterance_index = None + + for index, row in group.iterrows(): + if row["action_type"] == "User Utterance": + # Assign accumulated tool indices to + # the *previous* user utterance (if any) + if last_user_utterance_index is not None: + df.loc[last_user_utterance_index, "tool_pair"] = ( + str(tool_indices) + ) + # Reset for the current user utterance: + tool_indices = [] + last_user_utterance_index = index + + elif row["action_type"] == "Tool Invocation": + tool_indices.append(index) + + # After processing the group, assign any remaining + # tool indices to the last user utterance + if last_user_utterance_index is not None and tool_indices: + df.loc[last_user_utterance_index, "tool_pair"] = ( + str(tool_indices) + ) return df