python[patch]: accept simple evaluators (#1200)
Evaluators can now be written like this:

```python
from langsmith import evaluate

def simp(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    return {"results": [
        {"score": inputs == outputs, "key": 'identity'}, 
        {"score": outputs == reference_outputs, "key": "correct"}
    ]}

evaluate(
    (lambda x: x),
    data="Sample Dataset 3",
    evaluators=[simp],
)
```

Example experiment: left-tray-86
https://dev.smith.langchain.com/public/e7782ea0-3de5-4352-8cd4-7b2cdbb03e4c/d
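
Async evaluators are normalized the same way and can be run with `aevaluate`. A minimal sketch, assuming `aevaluate` is importable from the top-level `langsmith` package and reusing the dataset name from the example above; note that a single feedback dict is also an accepted return type, as exercised in the tests below:

```python
import asyncio

from langsmith import aevaluate


async def correct(outputs: dict, reference_outputs: dict) -> dict:
    # Returning one feedback dict (instead of {"results": [...]}) is also accepted.
    return {"key": "correct", "score": outputs == reference_outputs}


async def target(inputs: dict) -> dict:
    return inputs


async def main() -> None:
    await aevaluate(
        target,
        data="Sample Dataset 3",
        evaluators=[correct],
    )


asyncio.run(main())
```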

---------

Co-authored-by: William FH <[email protected]>
baskaryan and hinthornw authored Nov 18, 2024
1 parent 47671be commit 9336fce
Showing 3 changed files with 175 additions and 25 deletions.
1 change: 1 addition & 0 deletions python/langsmith/evaluation/_runner.py
@@ -87,6 +87,7 @@
[schemas.Run, Optional[schemas.Example]],
Union[EvaluationResult, EvaluationResults],
],
Callable[..., Union[dict, EvaluationResults, EvaluationResult]],
]
AEVALUATOR_T = Union[
Callable[
78 changes: 77 additions & 1 deletion python/langsmith/evaluation/evaluator.py
@@ -194,6 +194,10 @@ def __init__(
func (Callable): A function that takes a `Run` and an optional `Example` as
arguments, and returns a dict or `ComparisonEvaluationResult`.
"""
func = _normalize_evaluator_func(func)
if afunc:
afunc = _normalize_evaluator_func(afunc) # type: ignore[assignment]

wraps(func)(self)
from langsmith import run_helpers # type: ignore

@@ -288,7 +292,7 @@ def _format_result(
elif isinstance(result, list):
if not all(isinstance(x, dict) for x in result):
raise ValueError(
f"Expected a list of dicts or EvaluationResult. Received {result}."
f"Expected a list of dicts or EvaluationResults. Received {result}."
)
result = {"results": result} # type: ignore[misc]
elif isinstance(result, str):
@@ -645,3 +649,75 @@ def comparison_evaluator(
) -> DynamicComparisonRunEvaluator:
"""Create a comaprison evaluator from a function."""
return DynamicComparisonRunEvaluator(func)


def _normalize_evaluator_func(
func: Callable,
) -> Union[
Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT],
Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]],
]:
supported_args = ("run", "example", "inputs", "outputs", "reference_outputs")
sig = inspect.signature(func)
positional_args = [
pname
for pname, p in sig.parameters.items()
if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY)
]
if not positional_args or (
not all(pname in supported_args for pname in positional_args)
and len(positional_args) != 2
):
msg = (
f"Invalid evaluator function. Must have at least one positional "
f"argument. Supported positional arguments are {supported_args}. Please "
f"see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators"
# noqa: E501
)
raise ValueError(msg)
elif not all(
pname in supported_args for pname in positional_args
) or positional_args == ["run", "example"]:
# For backwards compatibility we assume custom arg names are Run and Example
# types, respectively.
return func
else:
if inspect.iscoroutinefunction(func):

async def awrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
arg_map = {
"run": run,
"example": example,
"inputs": example.inputs,
"outputs": run.outputs or {},
"reference_outputs": example.outputs or {},
}
args = (arg_map[arg] for arg in positional_args)
return await func(*args)

awrapper.__name__ = (
getattr(func, "__name__")
if hasattr(func, "__name__")
else awrapper.__name__
)
return awrapper # type: ignore[return-value]

else:

def wrapper(run: Run, example: Example) -> _RUNNABLE_OUTPUT:
arg_map = {
"run": run,
"example": example,
"inputs": example.inputs,
"outputs": run.outputs or {},
"reference_outputs": example.outputs or {},
}
args = (arg_map[arg] for arg in positional_args)
return func(*args)

wrapper.__name__ = (
getattr(func, "__name__")
if hasattr(func, "__name__")
else wrapper.__name__
)
return wrapper # type: ignore[return-value]
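
For orientation, the normalization above amounts to inspecting the evaluator's signature and passing it only the values it asks for by name. Below is a simplified, standalone sketch of that pattern (illustrative only, not the library code; plain dicts stand in for the `Run` and `Example` fields):

```python
import inspect
from typing import Any, Callable


def bind_supported_args(func: Callable, available: dict[str, Any]) -> Any:
    """Call `func` with whichever supported positional arguments it declares."""
    positional = [
        name
        for name, p in inspect.signature(func).parameters.items()
        if p.kind in (p.POSITIONAL_OR_KEYWORD, p.POSITIONAL_ONLY)
    ]
    return func(*(available[name] for name in positional))


def correctness(outputs: dict, reference_outputs: dict) -> dict:
    return {"key": "correct", "score": outputs == reference_outputs}


# Stand-ins for run.inputs, run.outputs, and example.outputs.
print(
    bind_supported_args(
        correctness,
        {
            "inputs": {"in": 1},
            "outputs": {"answer": 2},
            "reference_outputs": {"answer": 2},
        },
    )
)  # -> {'key': 'correct', 'score': True}
```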
121 changes: 97 additions & 24 deletions python/tests/unit_tests/evaluation/test_runner.py
@@ -21,6 +21,7 @@
from langsmith.client import Client
from langsmith.evaluation._arunner import aevaluate, aevaluate_existing
from langsmith.evaluation._runner import evaluate_existing
from langsmith.evaluation.evaluator import _normalize_evaluator_func


class FakeRequest:
@@ -120,6 +121,16 @@ def _wait_until(condition: Callable, timeout: int = 8):
raise TimeoutError("Condition not met")


def _create_example(idx: int) -> ls_schemas.Example:
return ls_schemas.Example(
id=uuid.uuid4(),
inputs={"in": idx},
outputs={"answer": idx + 1},
dataset_id="00886375-eb2a-4038-9032-efff60309896",
created_at=datetime.now(timezone.utc),
)


@pytest.mark.skipif(sys.version_info < (3, 9), reason="requires python3.9 or higher")
@pytest.mark.parametrize("blocking", [False, True])
@pytest.mark.parametrize("as_runnable", [False, True])
@@ -128,15 +139,6 @@ def test_evaluate_results(blocking: bool, as_runnable: bool) -> None:
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"

def _create_example(idx: int) -> ls_schemas.Example:
return ls_schemas.Example(
id=uuid.uuid4(),
inputs={"in": idx},
outputs={"answer": idx + 1},
dataset_id=ds_id,
created_at=datetime.now(timezone.utc),
)

SPLIT_SIZE = 3
NUM_REPETITIONS = 4
ds_examples = [_create_example(i) for i in range(10)]
@@ -196,6 +198,14 @@ def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}

def score_unpacked_inputs_outputs(inputs, outputs):
ordering_of_stuff.append("evaluate")
return {"score": outputs["output"]}

def score_unpacked_inputs_outputs_reference(inputs, outputs, reference_outputs):
ordering_of_stuff.append("evaluate")
return {"score": reference_outputs["answer"]}

def eval_float(run, example):
ordering_of_stuff.append("evaluate")
return 0.2
@@ -211,11 +221,20 @@ def eval_list(run, example):
{"score": 1, "key": "list_eval_int"},
]

evaluators = [
score_value_first,
score_unpacked_inputs_outputs,
score_unpacked_inputs_outputs_reference,
eval_float,
eval_str,
eval_list,
]

results = evaluate(
predict,
client=client,
data=dev_split,
evaluators=[score_value_first, eval_float, eval_str, eval_list],
evaluators=evaluators,
num_repetitions=NUM_REPETITIONS,
blocking=blocking,
)
@@ -242,18 +261,19 @@ def eval_list(run, example):
for r in results:
assert r["run"].outputs["output"] == r["example"].inputs["in"] + 1 # type: ignore
assert set(r["run"].outputs.keys()) == {"output"} # type: ignore
assert len(r["evaluation_results"]["results"]) == len(evaluators) + 1

assert fake_request.created_session
_wait_until(lambda: fake_request.runs)
N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
_wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
_wait_until(lambda: len(ordering_of_stuff) == (N_PREDS * (len(evaluators) + 1)))
_wait_until(lambda: slow_index is not None)
# Want it to be interleaved
assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS

# It's delayed, so it'll be the penultimate event
# Will run all other preds and evals, then this, then the last eval
assert slow_index == (N_PREDS - 1) * 5
assert slow_index == (len(evaluators) + 1) * (N_PREDS - 1)

def score_value(run, example):
return {"score": 0.7}
@@ -291,6 +311,25 @@ def bad_eval_list(run, example):
for r in results:
assert r["evaluation_results"]["results"][0].extra == {"error": True}

# test invalid evaluators
# args need to be positional
def eval1(*, inputs, outputs):
pass

# if more than 2 positional args, they must all have default arg names
# (run, example, ...)
def eval2(x, y, inputs):
pass

evaluators = [eval1, eval2]

for eval_ in evaluators:
with pytest.raises(ValueError, match="Invalid evaluator function."):
_normalize_evaluator_func(eval_)

with pytest.raises(ValueError, match="Invalid evaluator function."):
evaluate((lambda x: x), data=ds_examples, evaluators=[eval_], client=client)


def test_evaluate_raises_for_async():
async def my_func(inputs: dict):
@@ -328,15 +367,6 @@ async def test_aevaluate_results(blocking: bool, as_runnable: bool) -> None:
ds_name = "my-dataset"
ds_id = "00886375-eb2a-4038-9032-efff60309896"

def _create_example(idx: int) -> ls_schemas.Example:
return ls_schemas.Example(
id=uuid.uuid4(),
inputs={"in": idx},
outputs={"answer": idx + 1},
dataset_id=ds_id,
created_at=datetime.now(timezone.utc),
)

SPLIT_SIZE = 3
NUM_REPETITIONS = 4
ds_examples = [_create_example(i) for i in range(10)]
@@ -397,6 +427,16 @@ async def score_value_first(run, example):
ordering_of_stuff.append("evaluate")
return {"score": 0.3}

async def score_unpacked_inputs_outputs(inputs, outputs):
ordering_of_stuff.append("evaluate")
return {"score": outputs["output"]}

async def score_unpacked_inputs_outputs_reference(
inputs, outputs, reference_outputs
):
ordering_of_stuff.append("evaluate")
return {"score": reference_outputs["answer"]}

async def eval_float(run, example):
ordering_of_stuff.append("evaluate")
return 0.2
@@ -412,11 +452,20 @@ async def eval_list(run, example):
{"score": 1, "key": "list_eval_int"},
]

evaluators = [
score_value_first,
score_unpacked_inputs_outputs,
score_unpacked_inputs_outputs_reference,
eval_float,
eval_str,
eval_list,
]

results = await aevaluate(
predict,
client=client,
data=dev_split,
evaluators=[score_value_first, eval_float, eval_str, eval_list],
evaluators=evaluators,
num_repetitions=NUM_REPETITIONS,
blocking=blocking,
)
@@ -452,14 +501,14 @@ async def eval_list(run, example):
assert fake_request.created_session
_wait_until(lambda: fake_request.runs)
N_PREDS = SPLIT_SIZE * NUM_REPETITIONS
_wait_until(lambda: len(ordering_of_stuff) == N_PREDS * 5)
_wait_until(lambda: len(ordering_of_stuff) == N_PREDS * (len(evaluators) + 1))
_wait_until(lambda: slow_index is not None)
# Want it to be interleaved
assert ordering_of_stuff[:N_PREDS] != ["predict"] * N_PREDS
assert slow_index is not None
# It's delayed, so it'll be the penultimate event
# Will run all other preds and evals, then this, then the last eval
assert slow_index == (N_PREDS - 1) * 5
assert slow_index == (N_PREDS - 1) * (len(evaluators) + 1)

assert fake_request.created_session["name"]

@@ -493,3 +542,27 @@ async def bad_eval_list(run, example):
)
async for r in results:
assert r["evaluation_results"]["results"][0].extra == {"error": True}

# test invalid evaluators
# args need to be positional
async def eval1(*, inputs, outputs):
pass

# if more than 2 positional args, they must all have default arg names
# (run, example, ...)
async def eval2(x, y, inputs):
pass

evaluators = [eval1, eval2]

async def atarget(x):
return x

for eval_ in evaluators:
with pytest.raises(ValueError, match="Invalid evaluator function."):
_normalize_evaluator_func(eval_)

with pytest.raises(ValueError, match="Invalid evaluator function."):
await aevaluate(
atarget, data=ds_examples, evaluators=[eval_], client=client
)
