diff --git a/examples/pytest/README.md b/examples/pytest/README.md
index 732e0cb0..2af34ac1 100644
--- a/examples/pytest/README.md
+++ b/examples/pytest/README.md
@@ -128,7 +128,9 @@ E.g. we only pass tests if all the outputs are as expected, or we pass if 80% of
 also log this to a file, or a database, etc. for further inspection and record keeping, or combining it with
 open source frameworks [mlflow](https://mlflow.org) and using their [evaluate functionality](https://mlflow.org/docs/latest/llms/llm-evaluate/index.html).
 
-Note: we can also combine `results_bag` with ``pytest.mark.parametrize`` to run the same test with different inputs and expected outputs:
+Note (1): if you want to build your own way to aggregate results, see the very simple custom fixture in `conftest.py` for another approach.
+
+Note (2): we can also combine `results_bag` with ``pytest.mark.parametrize`` to run the same test with different inputs and expected outputs:
 
 ```python
 import pytest
@@ -298,9 +300,11 @@ def test_an_agent_e2e_with_tracker(input_state, expected_state, results_bag, tra
 # An example
 
 Here in this directory we have:
- - `some_actions.py` - a file that defines an augmented LLM application (it's not a full agent) with some actions
+ - `some_actions.py` - a file that defines an augmented LLM application (it's not a full agent) with some actions. See the image below - note that the hypotheses action runs multiple hypotheses in parallel.
  - `test_some_actions.py` - a file that defines some tests for the actions in `some_actions.py`.
 
+![toy example](diagnosis.png)
+
 You'll see that we use the `results_bag` fixture to log the results of our tests,and then we can access these results via the `module_results_df` fixture that provides a pandas dataframe of the results.
 This dataframe is then saved as a CSV for uploading to google sheets, etc. for further analysis.
 You will also see uses of `pytest.mark.parametrize`
diff --git a/examples/pytest/some_actions.py b/examples/pytest/some_actions.py
index 9e3ac277..b878ec3f 100644
--- a/examples/pytest/some_actions.py
+++ b/examples/pytest/some_actions.py
@@ -1,3 +1,8 @@
+"""
+This is an example module that defines a Burr application.
+
+It hypothetically transcribes audio and then runs several hypotheses over the transcription in parallel to determine a medical diagnosis.
+"""
 from typing import Any, Callable, Dict, Generator, List, Tuple
 
 import openai
@@ -10,12 +15,14 @@
 
 @action(reads=["audio"], writes=["transcription"])
 def transcribe_audio(state: State) -> State:
+    """Action to transcribe audio."""
     # here we fake transcription. For this example the audio is text already...
     return state.update(transcription=state["audio"])
 
 
 @action(reads=["hypothesis", "transcription"], writes=["diagnosis"])
 def run_hypothesis(state: State) -> State:
+    """Action to run a hypothesis on a transcription."""
     client = openai.Client()  # here for simplicity because clients and SERDE don't mix well.
     hypothesis = state["hypothesis"]
     transcription = state["transcription"]
@@ -42,15 +49,20 @@ def run_hypothesis(state: State) -> State:
 
 
 class TestMultipleHypotheses(MapStates):
+    """Parallel action to test multiple hypotheses."""
+
     def action(self, state: State, inputs: Dict[str, Any]) -> Action | Callable | RunnableGraph:
+        """Which action to run for each state."""
         return run_hypothesis
 
     def states(
         self, state: State, context: ApplicationContext, inputs: Dict[str, Any]
     ) -> Generator[State, None, None]:
-        # You could easily have a list_hypotheses upstream action that writes to "hypothesis" in state
-        # And loop through those
-        # This hardcodes for simplicity
+        """Generate the states to run the action on.
+
+        You could easily have a list_hypotheses upstream action that writes to "hypothesis" in state
+        and loop through those; this hardcodes the hypotheses for simplicity.
+        """
         for hypothesis in [
             "Common cold",
             "Sprained ankle",
@@ -59,6 +71,7 @@ def states(
             yield state.update(hypothesis=hypothesis)
 
     def reduce(self, state: State, states: Generator[State, None, None]) -> State:
+        """Combine the outputs of the parallel action."""
         all_diagnoses_outputs = []
         for _sub_state in states:
             all_diagnoses_outputs.append(
@@ -77,6 +90,7 @@ def writes(self) -> List[str]:
 
 @action(reads=["diagnosis_outputs"], writes=["final_diagnosis"])
 def determine_diagnosis(state: State) -> State:
+    """Action to determine the final diagnosis."""
     # could also get an LLM to decide here, or have a human decide, etc.
     possible_hypotheses = [d for d in state["diagnosis_outputs"] if d["diagnosis"].lower() == "yes"]
     if len(possible_hypotheses) == 1:
@@ -90,6 +104,7 @@ def determine_diagnosis(state: State) -> State:
 
 
 def build_graph() -> core.Graph:
+    """Builds the graph for the application."""
     graph = (
         GraphBuilder()
         .with_actions(
@@ -115,7 +130,17 @@ def build_application(
     tracker,
     use_otel_tracing: bool = False,
 ) -> core.Application:
-    """Builds an application with the given parameters."""
+    """Builds an application with the given parameters.
+
+    :param app_id: the ID to give this application instance
+    :param graph: the graph the application should run
+    :param initial_state: the state to start the application with
+    :param initial_entrypoint: the action to start execution from
+    :param partition_key: the partition key to use for tracking
+    :param tracker: the tracker to use (or None for no tracking)
+    :param use_otel_tracing: whether to enable OpenTelemetry tracing
+    :return: the built application
+    """
     app_builder = (
         core.ApplicationBuilder()
         .with_graph(graph)
@@ -132,6 +157,13 @@ def build_application(
 def run_my_agent(
     input_audio: str, partition_key: str = None, app_id: str = None, tracking_project: str = None
 ) -> Tuple[str, str]:
+    """Runs the agent with the given input audio (in this case a string transcription...).
+    :param input_audio: we fake audio here and just ask for a string...
+    :param partition_key: the partition key to use for tracking
+    :param app_id: the ID to give this application run
+    :param tracking_project: the tracking project name (enables tracking if provided)
+    :return:
+    """
     graph = build_graph()
     tracker = None
     if tracking_project:
diff --git a/examples/pytest/test_some_actions.py b/examples/pytest/test_some_actions.py
index 9af90d42..d0b131ba 100644
--- a/examples/pytest/test_some_actions.py
+++ b/examples/pytest/test_some_actions.py
@@ -158,6 +158,9 @@ def test_run_hypothesis_burr_fixture_e2e_with_tracker(input_state, expected_stat
 
 
 def test_print_results(module_results_df):
+    """This is an example of using pytest-harvest to aggregate results in a central location.
+    You could use other plugins, or create your own fixtures (e.g. see conftest.py for a simpler custom fixture).
+    """
     print(module_results_df.columns)
     print(module_results_df.head())
     # compute statistics
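Side note on the `conftest.py` fixture referenced above (README Note (1) and the `test_print_results` docstring): it is not shown in this diff. Below is a minimal, hypothetical sketch of what such a results-aggregating fixture could look like; the fixture name `results_collector`, its module scope, and the print-based teardown are illustrative assumptions, not the actual contents of `examples/pytest/conftest.py`.

```python
# conftest.py (hypothetical sketch; not the actual fixture shipped with this example)
import pytest


@pytest.fixture(scope="module")
def results_collector():
    """Module-scoped list that tests append result rows to."""
    results = []
    yield results
    # Teardown runs after all tests in the module: aggregate however you like,
    # e.g. print a summary, assert an overall pass rate, or dump rows to a CSV.
    for row in results:
        print(row)
```

A test would then request `results_collector` and append a row per test case alongside its asserts, similar to how `results_bag` is used in the README examples.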