python[patch]: accept simple evaluators (#1200)
Evaluators can now be written like this:

```python
from langsmith import evaluate

def simp(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    return {"results": [
        {"score": inputs == outputs, "key": 'identity'}, 
        {"score": outputs == reference_outputs, "key": "correct"}
    ]}

evaluate(
    (lambda x: x),
    data="Sample Dataset 3",
    evaluators=[simp],
)
```

Example experiment: left-tray-86
https://dev.smith.langchain.com/public/e7782ea0-3de5-4352-8cd4-7b2cdbb03e4c/d
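
Async evaluators are normalized the same way and can be run with `aevaluate`. A minimal sketch, assuming `aevaluate` is importable from the top-level `langsmith` package and reusing the dataset name from the example above; note that a single feedback dict is also an accepted return type, as exercised in the tests below:

```python
import asyncio

from langsmith import aevaluate


async def correct(outputs: dict, reference_outputs: dict) -> dict:
    # Returning one feedback dict (instead of {"results": [...]}) is also accepted.
    return {"key": "correct", "score": outputs == reference_outputs}


async def target(inputs: dict) -> dict:
    return inputs


async def main() -> None:
    await aevaluate(
        target,
        data="Sample Dataset 3",
        evaluators=[correct],
    )


asyncio.run(main())
```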

---------

Co-authored-by: William FH <[email protected]>
baskaryan and hinthornw authored Nov 18, 2024
1 parent 47671be commit 9336fce
Showing 3 changed files with 175 additions and 25 deletions.
1 change: 1 addition & 0 deletions python/langsmith/evaluation/_runner.py
@@ -87,6 +87,7 @@
[schemas.Run, Optional[schemas.Example]],
Union[EvaluationResult, EvaluationResults],
],
Callable[..., Union[dict, EvaluationResults, EvaluationResult]],
]
AEVALUATOR_T = Union[
Callable[
78 changes: 77 additions & 1 deletion python/langsmith/evaluation/evaluator.py
@@ -194,6 +194,10 @@ def __init__(
func (Callable): A function that takes a `Run` and an optional `Example` as
arguments, and returns a dict or `ComparisonEvaluationResult`.
"""
func = _normalize_evaluator_func(func)
if afunc:
afunc = _normalize_evaluator_func(afunc) # type: ignore[assignment]

wraps(func)(self)
from langsmith import run_helpers # type: ignore

@@ -288,7 +292,7 @@ def _format_result(
elif isinstance(result, list):
if not all(isinstance(x, dict) for x in result):
raise ValueError(
f"Expected a list of dicts or EvaluationResult. Received {result}."
f"Expected a list of dicts or EvaluationResults. Received {result}."
)
result = {"results": result} # type: ignore[misc]
elif isinstance(result, str):
@@ -645,3 +649,75 @@ def comparison_evaluator(
) -> DynamicComparisonRunEvaluator:
"""Create a comaprison evaluator from a function."""
return DynamicComparisonRunEvaluator(func)


def _normalize_evaluator_func(
func: Callable,
) -> Union[
Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT],
Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]],
]:
supported_args = ("run", "example", "inputs", "outputs", "reference_outputs")
sig = inspect.signature(func)
positional_args = [
pname
for pname, p in sig.parameters.items()
if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY)
]
if not positional_args or (
not all(pname in supported_args for pname in positional_args)
and len(positional_args) != 2
):
msg = (
f"Invalid evaluator function. Must have at least one positional "
f"argument. Supported positional arguments are {supported_args}. Please "
f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators"
# noqa: E501
)
raise ValueError(msg)
elif not all(
pname in supported_args for pname in positional_args
) or positional_args == ["run", "example"]:
# For backwards compatibility we assume custom arg names are Run and Example
# types, respectively.
return func
else:
if inspect.iscoroutinefunction(func):

async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
arg_map = {
"run": run,
"example": example,
"inputs": example.inputs,
"outputs": run.outputs or {},
"reference_outputs": example.outputs or {},
}
args = (arg_map[arg] for arg in positional_args)
return await func(*args)

awrapper.__name__ = (
getattr(func, "__name__")
if hasattr(func, "__name__")
else awrapper.__name__
)
return awrapper # type: ignore[return-value]

else:

def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
arg_map = {
"run": run,
"example": example,
"inputs": example.inputs,
"outputs": run.outputs or {},
"reference_outputs": example.outputs or {},
}
args = (arg_map[arg] for arg in positional_args)
return func(*args)

wrapper.__name__ = (
getattr(func, "__name__")
if hasattr(func, "__name__")
else wrapper.__name__
)
return wrapper # type: ignore[return-value]
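
For orientation, the normalization above amounts to inspecting the evaluator's signature and passing it only the values it asks for by name. Below is a simplified, standalone sketch of that pattern (illustrative only, not the library code; plain dicts stand in for the `Run` and `Example` fields):

```python
import inspect
from typing import Any, Callable


def bind_supported_args(func: Callable, available: dict[str, Any]) -> Any:
    """Call `func` with whichever supported positional arguments it declares."""
    positional = [
        name
        for name, p in inspect.signature(func).parameters.items()
        if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY)
    ]
    return func(*(available[name] for name in positional))


def correctness(outputs: dict, reference_outputs: dict) -> dict:
    return {"key": "correct", "score": outputs == reference_outputs}


# Stand-ins for run.inputs, run.outputs, and example.outputs.
print(
    bind_supported_args(
        correctness,
        {
            "inputs": {"in": 1},
            "outputs": {"answer": 2},
            "reference_outputs": {"answer": 2},
        },
    )
)  # -> {'key': 'correct', 'score': True}
```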
121 changes: 97 additions & 24 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -21,6 +21,7 @@
from langsmith.client import Client
from langsmith.evaluation._arunner import aevaluate, aevaluate_existing
from langsmith.evaluation._runner import evaluate_existing
from langsmith.evaluation.evaluator import _normalize_evaluator_func


class FakeRequest:
@@ -120,6 +121,16 @@ def _wait_until(condition: Callable, timeout: int = 8):
raise TimeoutError("Condition not met")


def _create_example(idx: int) -> ls_schemas.Example:
return ls_schemas.Example(
id=uuid.uuid4(),
inputs={"in": idx},
outputs={"answer": idx + 1},
dataset_id="00886375-eb2a-4038-9032-efff60309896",
created_at=datetime.now(timezone.utc),
)


@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
@pytest.mark.parametrize("blocking", [False, True])
@pytest.mark.parametrize("as_runnable", [False, True])
@@ -128,15 +139,6 @@ def test_evaluate_results(blocking: bool, as_runnable: bool) -> None:
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"

def _create_example(idx: int) -> ls_schemas.Example:
return ls_schemas.Example(
id=uuid.uuid4(),
inputs={"in": idx},
outputs={"answer": idx + 1},
dataset_id=ds_id,
created_at=datetime.now(timezone.utc),
)

SPLIT_SIZE = 3
NUM_REPETITIONS = 4
ds_examples = [_create_example(i) for i in range(10)]
@@ -196,6 +198,14 @@ def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}

def score_unpacked_inputs_outputs(inputs, outputs):
ordering_of_stuff.append("evaluate")
return {"score": outputs["output"]}

def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs):
ordering_of_stuff.append("evaluate")
return {"score": reference_outputs["answer"]}

def eval_float(run, example):
ordering_of_stuff.append("evaluate")
return 0.2
@@ -211,11 +221,20 @@ def eval_list(run, example):
{"score": 1, "key": "list_eval_int"},
]

evaluators = [
score_value_first,
score_unpacked_inputs_outputs,
score_unpacked_inputs_outputs_reference,
eval_float,
eval_str,
eval_list,
]

results = evaluate(
predict,
client=client,
data=dev_split,
evaluators=[score_value_first, eval_float, eval_str, eval_list],
evaluators=evaluators,
num_repetitions=NUM_REPETITIONS,
blocking=blocking,
)
@@ -242,18 +261,19 @@ def eval_list(run, example):
for r in results:
assert r["run"].outputs["output"] == r["example"].inputs["in"] + 1 # type: ignore
assert set(r["run"].outputs.keys()) == {"output"} # type: ignore
assert len(r["evaluation_results"]["results"]) == len(evaluators) + 1

assert fake_request.created_session
_wait_until(lambda: fake_request.runs)
N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
_wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
_wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1)))
_wait_until(lambda: slow_index is not None)
# Want it to be interleaved
assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS

# It's delayed, so it'll be the penultimate event
# Will run all other preds and evals, then this, then the last eval
assert slow_index == (N_PREDS - 1) * 5
assert slow_index == (len(evaluators) + 1) * (N_PREDS - 1)

def score_value(run, example):
return {"score": 0.7}
@@ -291,6 +311,25 @@ def bad_eval_list(run, example):
for r in results:
assert r["evaluation_results"]["results"][0].extra == {"error": True}

# test invalid evaluators
# args need to be positional
def eval1(*, inputs, outputs):
pass

# if more than 2 positional args, they must all have default arg names
# (run, example, ...)
def eval2(x, y, inputs):
pass

evaluators = [eval1, eval2]

for eval_ in evaluators:
with pytest.raises(ValueError, match="Invalid evaluator function."):
_normalize_evaluator_func(eval_)

with pytest.raises(ValueError, match="Invalid evaluator function."):
evaluate((lambda x: x), data=ds_examples, evaluators=[eval_], client=client)


def test_evaluate_raises_for_async():
async def my_func(inputs: dict):
@@ -328,15 +367,6 @@ async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None:
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"

def _create_example(idx: int) -> ls_schemas.Example:
return ls_schemas.Example(
id=uuid.uuid4(),
inputs={"in": idx},
outputs={"answer": idx + 1},
dataset_id=ds_id,
created_at=datetime.now(timezone.utc),
)

SPLIT_SIZE = 3
NUM_REPETITIONS = 4
ds_examples = [_create_example(i) for i in range(10)]
@@ -397,6 +427,16 @@ async def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}

async def score_unpacked_inputs_outputs(inputs, outputs):
ordering_of_stuff.append("evaluate")
return {"score": outputs["output"]}

async def score_unpacked_inputs_outputs_reference(
inputs, outputs, reference_outputs
):
ordering_of_stuff.append("evaluate")
return {"score": reference_outputs["answer"]}

async def eval_float(run, example):
ordering_of_stuff.append("evaluate")
return 0.2
@@ -412,11 +452,20 @@ async def eval_list(run, example):
{"score": 1, "key": "list_eval_int"},
]

evaluators = [
score_value_first,
score_unpacked_inputs_outputs,
score_unpacked_inputs_outputs_reference,
eval_float,
eval_str,
eval_list,
]

results = await aevaluate(
predict,
client=client,
data=dev_split,
evaluators=[score_value_first, eval_float, eval_str, eval_list],
evaluators=evaluators,
num_repetitions=NUM_REPETITIONS,
blocking=blocking,
)
@@ -452,14 +501,14 @@ async def eval_list(run, example):
assert fake_request.created_session
_wait_until(lambda: fake_request.runs)
N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
_wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
_wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1))
_wait_until(lambda: slow_index is not None)
# Want it to be interleaved
assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS
assert slow_index is not None
# It's delayed, so it'll be the penultimate event
# Will run all other preds and evals, then this, then the last eval
assert slow_index == (N_PREDS - 1) * 5
assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1)

assert fake_request.created_session["name"]

@@ -493,3 +542,27 @@ async def bad_eval_list(run, example):
)
async for r in results:
assert r["evaluation_results"]["results"][0].extra == {"error": True}

# test invalid evaluators
# args need to be positional
async def eval1(*, inputs, outputs):
pass

# if more than 2 positional args, they must all have default arg names
# (run, example, ...)
async def eval2(x, y, inputs):
pass

evaluators = [eval1, eval2]

async def atarget(x):
return x

for eval_ in evaluators:
with pytest.raises(ValueError, match="Invalid evaluator function."):
_normalize_evaluator_func(eval_)

with pytest.raises(ValueError, match="Invalid evaluator function."):
await aevaluate(
atarget, data=ds_examples, evaluators=[eval_], client=client
)
