Add datasets for tool usage tasks (#46)

Add datasets
langchain-ai · Nov 21, 2023 · 7253b43 · 7253b43
1 parent fd0203c
commit 7253b43
Show file tree

Hide file tree

Showing 4 changed files with 349 additions and 48 deletions.
diff --git a/langchain_benchmarks/tool_usage/tasks/multiverse_math.py b/langchain_benchmarks/tool_usage/tasks/multiverse_math.py
@@ -8,6 +8,34 @@
 The modified operations should yield different results, but still retain
 appropriate properties. For example, the modified multiplication operation
 should still be commutative.
+
+Please note that the modified operations are not guaranteed to even make sense
+in the real world since not all properties will be retained (e.g.,
+distributive property).
+
+For example,
+
+I ate 1 apple and 2 oranges every day for 7 days. How many fruits did I eat?
+
+One would expect the answer to be 21, but in this universe, the answer is 32.34.
+
+In addition, it depends on how the operations are grouped:
+
+(1 + 2) * 7 = 32.34
+
+But:
+
+1 * 7 + 2 * 7 = 24.3
+
+Due to these changes, certain questions are not allowed as inputs, as they
+would yield different results if evaluated in different ways.
+
+For example, "convert 15 degrees to radians" is not allowed as an input
+as it could be interpreted as either:
+
+divide(multiply(15, pi()), 180)
+or
+multiply(divide(15, 180), pi())
 """
 import math
 from typing import List, cast
@@ -25,14 +53,24 @@ def multiply(a: float, b: float) -> float:
 def divide(a: float, b: float) -> float:
     """Divide two numbers; a / b."""
     # Division is neither commutative nor associative
-    return a / (b + 0.5)
+    return 0.5 * a / b
 
 
 def add(a: float, b: float) -> float:
     """Add two numbers; a + b."""
     return a + b + 1.2
 
 
+def sin(radians: float) -> float:
+    """The sine of an angle in radians."""
+    # Intentionally swapped: in this universe sine yields the real cosine.
+    return math.cos(radians)
+
+
+def cos(radians: float) -> float:
+    """The cosine of an angle in radians."""
+    # Intentionally swapped: in this universe cosine yields the real sine.
+    return math.sin(radians)
+
+
 def subtract(a: float, b: float) -> float:
     """Subtract two numbers; a - b."""
     return a - b - 3
@@ -45,42 +83,43 @@ def power(a: float, b: float) -> float:
 
 def log(a: float, base: float) -> float:
     """Take the log of a number; log(a, base)."""
-    return math.log(a, base + 1.5)
+    # Force the base to always be positive -- hard to predict what will happen
+    # in this universe :)
+    return math.log(a, abs(base + 1.5))
+
+
+def pi() -> float:
+    """Returns a precise value of PI for this alternate universe."""
+    # Intentionally not 3.14159...: this universe's PI is Euler's number.
+    return math.e
 
 
 def negate(a: float) -> float:
     """Negate a number; -a."""
     return a  # negation does not negate the number
 
 
-# Temporary dataset
-DATASET = [
-    # 2-tuple format of (question, answer)
-    ("Add 2 and 3", add(2, 3)),
-    ("Subtract 3 from 2", subtract(2, 3)),
-    (
-        "I ate 1 apple and 2 oranges every day for 7 days. How many fruits did I eat?",
-        multiply(7, add(1, 2)),
-    ),
-    (
-        "what is the result of 2 to the power of 3?",
-        power(2, 3),
-    ),
-    (
-        "calculate sqrt of 101 to 4 digits of precision",
-        round(power(101, 0.4), 4),
-    ),
-]
-
-
 # PUBLIC API
 
 
 def get_environment() -> ToolUsageEnvironment:
     """Create an environment."""
     tools = cast(
         List[BaseTool],
-        [tool(func) for func in [multiply, add, divide, subtract, power, log, negate]],
+        [
+            tool(func)
+            for func in [
+                multiply,
+                add,
+                divide,
+                subtract,
+                power,
+                log,
+                negate,
+                sin,
+                cos,
+                pi,
+            ]
+        ],
     )
     return ToolUsageEnvironment(
         tools=tools,
@@ -90,15 +129,15 @@ def get_environment() -> ToolUsageEnvironment:
 
 MULTIVERSE_MATH = ToolUsageTask(
     name="Multiverse Math",
-    dataset_id="placeholder",
+    dataset_id="https://smith.langchain.com/public/3f2b389f-dcb3-4669-8174-1be4f0a86576/d",
     create_environment=get_environment,
     instructions=(
         "You are requested to solve math questions in an alternate "
-        "mathematical universe. The rules of association, commutativity, "
-        "and distributivity still apply, but the operations have been "
-        "altered to yield different results than expected. Solve the "
-        "given math questions using the provided tools. "
-        "Do not guess the answer."
+        "mathematical universe. The operations have been altered to yield "
+        "different results than expected. Do not guess the answer or rely on your "
+        " innate knowledge of math. Use the provided tools to answer the question. "
+        "While associativity and commutativity apply, distributivity does not. Answer "
+        "the question using the fewest possible tools."
     ),
     description=(
         """\
@@ -113,3 +152,90 @@ def get_environment() -> ToolUsageEnvironment:
 """
     ),
 )
+
+# Source dataset used to create the public dataset in LangSmith.
+# Each entry holds the question, the answer computed with the altered
+# operations defined in this module, and the names of the expected steps.
+DATASET = [
+    {
+        "question": "Add 2 and 3",
+        "answer": add(2, 3),
+        "expected_steps": ["add"],
+    },
+    {
+        "question": "Subtract 3 from 2",
+        "answer": subtract(2, 3),
+        "expected_steps": ["subtract"],
+    },
+    {
+        "question": "What is -5 if evaluated using the negate function?",
+        "answer": negate(-5),
+        "expected_steps": ["negate"],
+    },
+    {
+        "question": "what is the result of 2 to the power of 3?",
+        "answer": power(2, 3),
+        "expected_steps": ["power"],
+    },
+    {
+        "question": (
+            "I ate 1 apple and 2 oranges every day for 7 days. "
+            "How many fruits did I eat?"
+        ),
+        "answer": multiply(7, add(1, 2)),
+        "expected_steps": ["multiply", "add"],
+    },
+    {
+        "question": "multiply the result of (log of 100 to base 10) by 3",
+        "answer": multiply(log(100, 10), 3),
+        "expected_steps": ["log", "multiply"],
+    },
+    {
+        "question": "calculate sqrt of 101 to 4 digits of precision",
+        "answer": round(power(101, 0.4), 4),
+        # NOTE(review): "round" is not one of the tools registered in
+        # get_environment() -- confirm it is meant to be an expected step.
+        "expected_steps": ["power", "round"],
+    },
+    {
+        "question": (
+            "ecoli divides every 20 minutes. How many cells will be "
+            "there after 2 hours if we start with 5 cells?"
+        ),
+        "answer": multiply(5, power(2, divide(120, 20))),
+        "expected_steps": ["divide", "power", "multiply"],
+    },
+    {
+        "question": (
+            "after calculating the sin of 1.5 radians, divide "
+            "the result by cos of 1.5 radians"
+        ),
+        # Use this universe's divide() tool, not true division, so the answer
+        # matches the "divide" step listed in expected_steps.
+        "answer": divide(sin(1.5), cos(1.5)),
+        "expected_steps": ["sin", "cos", "divide"],
+    },
+    {
+        # NOTE(review): the module docstring lists this question as not
+        # allowed; the grouping below is the one the dataset commits to.
+        "question": "convert 15 degrees to radians",
+        "answer": divide(multiply(15, pi()), 180),
+        "expected_steps": ["pi", "multiply", "divide"],
+    },
+]
+
+
+def _create_dataset() -> None:
+    """Create a dataset with the langsmith client.
+
+    Creates a dataset using the name and description of ``MULTIVERSE_MATH``
+    and adds one example per entry in ``DATASET``.
+    """
+    # Imported at call time, inside the function.
+    from langsmith.client import Client
+
+    client = Client()
+
+    dataset = client.create_dataset(
+        dataset_name=MULTIVERSE_MATH.name,
+        description=MULTIVERSE_MATH.description,
+    )
+
+    for example in DATASET:
+        client.create_example(
+            inputs={
+                "question": example["question"],
+            },
+            outputs={
+                # The computed answer is stored under the "output" key.
+                "output": example["answer"],
+                "expected_steps": example["expected_steps"],
+            },
+            dataset_id=dataset.id,
+        )
diff --git a/langchain_benchmarks/tool_usage/tasks/type_writer.py b/langchain_benchmarks/tool_usage/tasks/type_writer.py
@@ -18,11 +18,13 @@ class Paper:
     content: str
 
 
-def function(paper: Paper) -> Callable[[], str]:
+def create_typer(paper: Paper) -> Callable[[], str]:
     """Create a function that types the given letter."""
 
     def type_letter(letter: str) -> str:
         """Print the given letter on the paper."""
+        if len(letter) != 1:
+            return "ERROR: The letter must be a single character."
         paper.content += letter
         return "OK"
 
@@ -47,7 +49,7 @@ def _read_state() -> Any:
         """Read the state of the environment."""
         return paper.content
 
-    tools = cast(List[BaseTool], [tool(function(paper))])
+    tools = cast(List[BaseTool], [tool(create_typer(paper))])
 
     return ToolUsageEnvironment(
         tools=tools,
@@ -56,22 +58,21 @@ def _read_state() -> Any:
 
 
 TYPE_WRITER_TASK = ToolUsageTask(
-    name="Tool Usage - Typewriter (1 func)",
-    dataset_id="placeholder",
+    name="Tool Usage - Typewriter (1 tool)",
+    dataset_id="https://smith.langchain.com/public/2300c32a-5063-4b5c-9e60-b97ad7542126/d",
     create_environment=get_environment,
     instructions=(
-        "Repeat the given string by using the provided tools. "
+        "Repeat the given string using the provided tools. "
         "Do not write anything else or provide any explanations. "
-        "For example, if the string is 'abc', you must invoke the tools "
-        "'a', 'b', and 'c' in that order. "
-        "Please invoke the function with a single letter at a time."
+        "For example, if the string is 'abc', you must print the letters "
+        "'a', 'b', and 'c' one at a time and in that order. "
     ),
     description=(
         """\
-Environment with a single function that accepts a single letter as input, and \
-"prints" it on a piece of paper.
+Environment with a single tool that accepts a single letter as input, and \
+prints it on a piece of virtual paper.
 
-The objective of this task is to evaluate the ability to use the provided \
+The objective of this task is to evaluate the ability of the model to use the provided \
 tools to repeat a given input string.
 
 For example, if the string is 'abc', the tools 'a', 'b', and 'c' must be invoked \
@@ -82,3 +83,77 @@ def _read_state() -> Any:
 """
     ),
 )
+
+
+STRINGS_TO_TYPE = [
+    # letter repetition
+    "a",
+    "aa",
+    "aaa",
+    "aaaa",
+    # 3-letter words
+    "dog",
+    "cat",
+    # 4-letter words
+    "hand",
+    "head",
+    # 5-letter words
+    "house",
+    "horse",
+    # 6-letter words
+    "school",
+    "church",
+    # 7-letter words
+    "teacher",
+    "student",
+    # 8-letter words
+    "computer",
+    "keyboard",
+    # 10-letter words
+    "university",
+    "dictionary",
+    # longer words (11+ letters)
+    "information",
+    "communication",
+]
+
+
+def _build_dataset(strings: List[str]) -> List[dict]:
+    """Build the in-memory dataset for the typewriter task.
+
+    Named differently from the ``_create_dataset`` uploader defined later in
+    this module, which would otherwise shadow this function.
+
+    Args:
+        strings: The strings the model is asked to type.
+
+    Returns:
+        One record per string holding the question, the expected steps
+        (one ``type_letter`` call per character) and the expected ``state``.
+    """
+    return [
+        {
+            "question": string,
+            "expected_steps": ["type_letter"] * len(string),
+            "state": string,
+        }
+        for string in strings
+    ]
+
+
+DATASET = _build_dataset(STRINGS_TO_TYPE)
+
+
+def _create_dataset() -> None:
+    """Create a dataset with the langsmith client.
+
+    Creates a dataset using the name and description of ``TYPE_WRITER_TASK``
+    and adds one example per entry in ``DATASET``.
+    """
+    # Imported at call time, inside the function.
+    from langsmith.client import Client
+
+    client = Client()
+    dataset = client.create_dataset(
+        dataset_name=TYPE_WRITER_TASK.name,
+        description=TYPE_WRITER_TASK.description,
+    )
+
+    for example in DATASET:
+        client.create_example(
+            inputs={
+                "question": example["question"],
+            },
+            outputs={
+                # "output" and "state" are both filled from example["state"].
+                "output": example["state"],
+                "expected_steps": example["expected_steps"],
+                "state": example["state"],
+            },
+            dataset_id=dataset.id,
+        )