From 9336fce0f22bb0d8128a4feb9407917bf6a611cb Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Mon, 18 Nov 2024 09:39:49 -0800 Subject: [PATCH] python[patch]: accept simple evaluators (#1200) Evaluators can now be written as plain functions that take `inputs`, `outputs`, and/or `reference_outputs` directly, like this: ```python from langsmith import evaluate def simp(inputs: dict, outputs: dict, reference_outputs: dict) -> dict: return {"results": [ {"score": inputs == outputs, "key": 'identity'}, {"score": outputs == reference_outputs, "key": "correct"} ]} evaluate( (lambda x: x), data="Sample Dataset 3", evaluators=[simp], ) ``` Example experiment: left-tray-86 https://dev.smith.langchain.com/public/e7782ea0-3de5-4352-8cd4-7b2cdbb03e4c/d --------- Co-authored-by: William FH <13333726+hinthornw@users.noreply.github.com> --- python/langsmith/evaluation/_runner.py | 1 + python/langsmith/evaluation/evaluator.py | 78 ++++++++++- .../unit_tests/evaluation/test_runner.py | 121 ++++++++++++++---- 3 files changed, 175 insertions(+), 25 deletions(-) diff --git a/python/langsmith/evaluation/_runner.py b/python/langsmith/evaluation/_runner.py index 8c2fe800c..8ed55f6bf 100644 --- a/python/langsmith/evaluation/_runner.py +++ b/python/langsmith/evaluation/_runner.py @@ -87,6 +87,7 @@ [schemas.Run, Optional[schemas.Example]], Union[EvaluationResult, EvaluationResults], ], + Callable[..., Union[dict, EvaluationResults, EvaluationResult]], ] AEVALUATOR_T = Union[ Callable[ diff --git a/python/langsmith/evaluation/evaluator.py b/python/langsmith/evaluation/evaluator.py index 21e475d6c..feb0e95e4 100644 --- a/python/langsmith/evaluation/evaluator.py +++ b/python/langsmith/evaluation/evaluator.py @@ -194,6 +194,10 @@ def __init__( func (Callable): A function that takes a `Run` and an optional `Example` as arguments, and returns a dict or `ComparisonEvaluationResult`. 
""" + func = _normalize_evaluator_func(func) + if afunc: + afunc = _normalize_evaluator_func(afunc) # type: ignore[assignment] + wraps(func)(self) from langsmith import run_helpers # type: ignore @@ -288,7 +292,7 @@ def _format_result( elif isinstance(result, list): if not all(isinstance(x, dict) for x in result): raise ValueError( - f"Expected a list of dicts or EvaluationResult. Received {result}." + f"Expected a list of dicts or EvaluationResults. Received {result}." ) result = {"results": result} # type: ignore[misc] elif isinstance(result, str): @@ -645,3 +649,75 @@ def comparison_evaluator( ) -> DynamicComparisonRunEvaluator: """Create a comaprison evaluator from a function.""" return DynamicComparisonRunEvaluator(func) + + +def _normalize_evaluator_func( + func: Callable, +) -> Union[ + Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], + Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]], +]: + supported_args = ("run", "example", "inputs", "outputs", "reference_outputs") + sig = inspect.signature(func) + positional_args = [ + pname + for pname, p in sig.parameters.items() + if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY) + ] + if not positional_args or ( + not all(pname in supported_args for pname in positional_args) + and len(positional_args) != 2 + ): + msg = ( + f"Invalid evaluator function. Must have at least one positional " + f"argument. Supported positional arguments are {supported_args}. Please " + f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators" + # noqa: E501 + ) + raise ValueError(msg) + elif not all( + pname in supported_args for pname in positional_args + ) or positional_args == ["run", "example"]: + # For backwards compatibility we assume custom arg names are Run and Example + # types, respectively. 
+ return func + else: + if inspect.iscoroutinefunction(func): + + async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_args) + return await func(*args) + + awrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else awrapper.__name__ + ) + return awrapper # type: ignore[return-value] + + else: + + def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT: + arg_map = { + "run": run, + "example": example, + "inputs": example.inputs, + "outputs": run.outputs or {}, + "reference_outputs": example.outputs or {}, + } + args = (arg_map[arg] for arg in positional_args) + return func(*args) + + wrapper.__name__ = ( + getattr(func, "__name__") + if hasattr(func, "__name__") + else wrapper.__name__ + ) + return wrapper # type: ignore[return-value] diff --git a/python/tests/unit_tests/evaluation/test_runner.py b/python/tests/unit_tests/evaluation/test_runner.py index 38cee0488..408d4508d 100644 --- a/python/tests/unit_tests/evaluation/test_runner.py +++ b/python/tests/unit_tests/evaluation/test_runner.py @@ -21,6 +21,7 @@ from langsmith.client import Client from langsmith.evaluation._arunner import aevaluate, aevaluate_existing from langsmith.evaluation._runner import evaluate_existing +from langsmith.evaluation.evaluator import _normalize_evaluator_func class FakeRequest: @@ -120,6 +121,16 @@ def _wait_until(condition: Callable, timeout: int = 8): raise TimeoutError("Condition not met") +def _create_example(idx: int) -> ls_schemas.Example: + return ls_schemas.Example( + id=uuid.uuid4(), + inputs={"in": idx}, + outputs={"answer": idx + 1}, + dataset_id="00886375-eb2a-4038-9032-efff60309896", + created_at=datetime.now(timezone.utc), + ) + + @pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher") 
@pytest.mark.parametrize("blocking", [False, True]) @pytest.mark.parametrize("as_runnable", [False, True]) @@ -128,15 +139,6 @@ def test_evaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = [_create_example(i) for i in range(10)] @@ -196,6 +198,14 @@ def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + def score_unpacked_inputs_outputs(inputs, outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -211,11 +221,20 @@ def eval_list(run, example): {"score": 1, "key": "list_eval_int"}, ] + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + eval_float, + eval_str, + eval_list, + ] + results = evaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first, eval_float, eval_str, eval_list], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -242,18 +261,19 @@ def eval_list(run, example): for r in results: assert r["run"].outputs["output"] == r["example"].inputs["in"] + 1 # type: ignore assert set(r["run"].outputs.keys()) == {"output"} # type: ignore + assert len(r["evaluation_results"]["results"]) == len(evaluators) + 1 assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5) 
+ _wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1))) _wait_until(lambda: slow_index is not None) # Want it to be interleaved assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS - 1) * 5 + assert slow_index == (len(evaluators) + 1) * (N_PREDS - 1) def score_value(run, example): return {"score": 0.7} @@ -291,6 +311,25 @@ def bad_eval_list(run, example): for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} + # test invalid evaluators + # args need to be positional + def eval1(*, inputs, outputs): + pass + + # if more than 2 positional args, they must all have default arg names + # (run, example, ...) + def eval2(x, y, inputs): + pass + + evaluators = [eval1, eval2] + + for eval_ in evaluators: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_evaluator_func(eval_) + + with pytest.raises(ValueError, match="Invalid evaluator function."): + evaluate((lambda x: x), data=ds_examples, evaluators=[eval_], client=client) + def test_evaluate_raises_for_async(): async def my_func(inputs: dict): @@ -328,15 +367,6 @@ async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None: ds_name = "my-dataset" ds_id = "00886375-eb2a-4038-9032-efff60309896" - def _create_example(idx: int) -> ls_schemas.Example: - return ls_schemas.Example( - id=uuid.uuid4(), - inputs={"in": idx}, - outputs={"answer": idx + 1}, - dataset_id=ds_id, - created_at=datetime.now(timezone.utc), - ) - SPLIT_SIZE = 3 NUM_REPETITIONS = 4 ds_examples = [_create_example(i) for i in range(10)] @@ -397,6 +427,16 @@ async def score_value_first(run, example): ordering_of_stuff.append("evaluate") return {"score": 0.3} + async def score_unpacked_inputs_outputs(inputs, outputs): + ordering_of_stuff.append("evaluate") + return {"score": outputs["output"]} + + 
async def score_unpacked_inputs_outputs_reference( + inputs, outputs, reference_outputs + ): + ordering_of_stuff.append("evaluate") + return {"score": reference_outputs["answer"]} + async def eval_float(run, example): ordering_of_stuff.append("evaluate") return 0.2 @@ -412,11 +452,20 @@ async def eval_list(run, example): {"score": 1, "key": "list_eval_int"}, ] + evaluators = [ + score_value_first, + score_unpacked_inputs_outputs, + score_unpacked_inputs_outputs_reference, + eval_float, + eval_str, + eval_list, + ] + results = await aevaluate( predict, client=client, data=dev_split, - evaluators=[score_value_first, eval_float, eval_str, eval_list], + evaluators=evaluators, num_repetitions=NUM_REPETITIONS, blocking=blocking, ) @@ -452,14 +501,14 @@ async def eval_list(run, example): assert fake_request.created_session _wait_until(lambda: fake_request.runs) N_PREDS = SPLIT_SIZE * NUM_REPETITIONS - _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5) + _wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1)) _wait_until(lambda: slow_index is not None) # Want it to be interleaved assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS assert slow_index is not None # It's delayed, so it'll be the penultimate event # Will run all other preds and evals, then this, then the last eval - assert slow_index == (N_PREDS - 1) * 5 + assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1) assert fake_request.created_session["name"] @@ -493,3 +542,27 @@ async def bad_eval_list(run, example): ) async for r in results: assert r["evaluation_results"]["results"][0].extra == {"error": True} + + # test invalid evaluators + # args need to be positional + async def eval1(*, inputs, outputs): + pass + + # if more than 2 positional args, they must all have default arg names + # (run, example, ...) 
+ async def eval2(x, y, inputs): + pass + + evaluators = [eval1, eval2] + + async def atarget(x): + return x + + for eval_ in evaluators: + with pytest.raises(ValueError, match="Invalid evaluator function."): + _normalize_evaluator_func(eval_) + + with pytest.raises(ValueError, match="Invalid evaluator function."): + await aevaluate( + atarget, data=ds_examples, evaluators=[eval_], client=client + )