Merge branch 'master' into master

abhishek9998 · Aug 1, 2024 · 7f519c2 · 7f519c2
2 parents 8e6d47c + f14121f
commit 7f519c2
Show file tree

Hide file tree

Showing 3,376 changed files with 208,914 additions and 303,802 deletions.
diff --git a/.devcontainer/README.md b/.devcontainer/README.md
@@ -10,7 +10,7 @@ You can use the dev container configuration in this folder to build and run the
 You may use the button above, or follow these steps to open this repo in a Codespace:
 1. Click the **Code** drop-down menu at the top of https://github.com/langchain-ai/langchain.
 1. Click on the **Codespaces** tab.
-1. Click **Create codespace on master** .
+1. Click **Create codespace on master**.
 
 For more info, check out the [GitHub documentation](https://docs.github.com/en/free-pro-team@latest/github/developing-online-with-codespaces/creating-a-codespace#creating-a-codespace).
 

diff --git a/.devcontainer/docker-compose.yaml b/.devcontainer/docker-compose.yaml
@@ -5,10 +5,10 @@ services:
       dockerfile: libs/langchain/dev.Dockerfile
       context: ..
     volumes:
-   # Update this to wherever you want VS Code to mount the folder of your project
+      # Update this to wherever you want VS Code to mount the folder of your project
       - ..:/workspaces/langchain:cached
     networks:
-      - langchain-network 
+      - langchain-network
   #   environment:
   #     MONGO_ROOT_USERNAME: root
   #     MONGO_ROOT_PASSWORD: example123
@@ -28,5 +28,3 @@ services:
 networks:
   langchain-network:
     driver: bridge
-
-
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
@@ -4,9 +4,6 @@ contact_links:
   - name: 🤔 Question or Problem
     about: Ask a question or ask about a problem in GitHub Discussions.
     url: https://www.github.com/langchain-ai/langchain/discussions/categories/q-a
-  - name: Discord
-    url: https://discord.gg/6adMQxSpJS
-    about: General community discussions
   - name: Feature Request
     url: https://www.github.com/langchain-ai/langchain/discussions/categories/ideas
     about: Suggest a feature or an idea

diff --git a/.github/ISSUE_TEMPLATE/documentation.yml b/.github/ISSUE_TEMPLATE/documentation.yml
@@ -26,6 +26,13 @@ body:
       [LangChain Github Discussions](https://github.com/langchain-ai/langchain/discussions),
       [LangChain Github Issues](https://github.com/langchain-ai/langchain/issues?q=is%3Aissue),
       [LangChain ChatBot](https://chat.langchain.com/)
+- type: input
+  id: url
+  attributes:
+    label: URL
+    description: URL to documentation
+  validations:
+    required: false
 - type: checkboxes
   id: checks
   attributes:
@@ -48,4 +55,4 @@ body:
     label: "Idea or request for content:"
     description: >
       Please describe as clearly as possible what topics you think are missing
-      from the current documentation.
+      from the current documentation.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -26,4 +26,4 @@ Additional guidelines:
 - Changes should be backwards compatible.
 - If you are adding something to community, do not re-import it in langchain.
 
-If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17.
+If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.
diff --git a/.github/actions/people/app/main.py b/.github/actions/people/app/main.py
@@ -350,11 +350,7 @@ def get_graphql_pr_edges(*, settings: Settings, after: Union[str, None] = None):
         print("Querying PRs...")
     else:
         print(f"Querying PRs with cursor {after}...")
-    data = get_graphql_response(
-        settings=settings,
-        query=prs_query,
-        after=after
-    )
+    data = get_graphql_response(settings=settings, query=prs_query, after=after)
     graphql_response = PRsResponse.model_validate(data)
     return graphql_response.data.repository.pullRequests.edges
 
@@ -484,10 +480,16 @@ def get_contributors(settings: Settings):
             lines_changed = pr.additions + pr.deletions
             score = _logistic(files_changed, 20) + _logistic(lines_changed, 100)
             contributor_scores[pr.author.login] += score
-            three_months_ago = (datetime.now(timezone.utc) - timedelta(days=3*30))
+            three_months_ago = datetime.now(timezone.utc) - timedelta(days=3 * 30)
             if pr.createdAt > three_months_ago:
                 recent_contributor_scores[pr.author.login] += score
-    return contributors, contributor_scores, recent_contributor_scores, reviewers, authors
+    return (
+        contributors,
+        contributor_scores,
+        recent_contributor_scores,
+        reviewers,
+        authors,
+    )
 
 
 def get_top_users(
@@ -524,9 +526,13 @@ def get_top_users(
     # question_commentors, question_last_month_commentors, question_authors = get_experts(
     #     settings=settings
     # )
-    contributors, contributor_scores, recent_contributor_scores, reviewers, pr_authors = get_contributors(
-        settings=settings
-    )
+    (
+        contributors,
+        contributor_scores,
+        recent_contributor_scores,
+        reviewers,
+        pr_authors,
+    ) = get_contributors(settings=settings)
     # authors = {**question_authors, **pr_authors}
     authors = {**pr_authors}
     maintainers_logins = {
@@ -537,14 +543,17 @@ def get_top_users(
         "nfcampos",
         "efriis",
         "eyurtsev",
-        "rlancemartin"
+        "rlancemartin",
+        "ccurme",
+        "vbarda",
     }
     hidden_logins = {
         "dev2049",
         "vowelparrot",
         "obi1kenobi",
         "langchain-infra",
         "jacoblee93",
+        "isahers1",
         "dqbd",
         "bracesproul",
         "akira",
@@ -556,7 +565,7 @@ def get_top_users(
         maintainers.append(
             {
                 "login": login,
-                "count": contributors[login], #+ question_commentors[login],
+                "count": contributors[login],  # + question_commentors[login],
                 "avatarUrl": user.avatarUrl,
                 "twitterUsername": user.twitterUsername,
                 "url": user.url,
@@ -612,9 +621,7 @@ def get_top_users(
     new_people_content = yaml.dump(
         people, sort_keys=False, width=200, allow_unicode=True
     )
-    if (
-        people_old_content == new_people_content
-    ):
+    if people_old_content == new_people_content:
         logging.info("The LangChain People data hasn't changed, finishing.")
         sys.exit(0)
     people_path.write_text(new_people_content, encoding="utf-8")
@@ -627,9 +634,7 @@ def get_top_users(
     logging.info(f"Creating a new branch {branch_name}")
     subprocess.run(["git", "checkout", "-B", branch_name], check=True)
     logging.info("Adding updated file")
-    subprocess.run(
-        ["git", "add", str(people_path)], check=True
-    )
+    subprocess.run(["git", "add", str(people_path)], check=True)
     logging.info("Committing updated file")
     message = "👥 Update LangChain people data"
     result = subprocess.run(["git", "commit", "-m", message], check=True)
@@ -638,4 +643,4 @@ def get_top_users(
     logging.info("Creating PR")
     pr = repo.create_pull(title=message, body=message, base="master", head=branch_name)
     logging.info(f"Created PR: {pr.number}")
-    logging.info("Finished")
+    logging.info("Finished")
diff --git a/.github/scripts/check_diff.py b/.github/scripts/check_diff.py
@@ -1,16 +1,137 @@
+import glob
 import json
-import sys
 import os
-from typing import Dict
+import re
+import sys
+import tomllib
+from collections import defaultdict
+from typing import Dict, List, Set
+from pathlib import Path
+
 
 LANGCHAIN_DIRS = [
     "libs/core",
     "libs/text-splitters",
-    "libs/community",
     "libs/langchain",
+    "libs/community",
     "libs/experimental",
 ]
 
+
+def all_package_dirs() -> Set[str]:
+    return {
+        "/".join(path.split("/")[:-1]).lstrip("./")
+        for path in glob.glob("./libs/**/pyproject.toml", recursive=True)
+        if "libs/cli" not in path and "libs/standard-tests" not in path
+    }
+
+
+def dependents_graph() -> dict:
+    """
+    Construct a mapping of package -> dependents, such that we can
+    run tests on all dependents of a package when a change is made.
+    """
+    dependents = defaultdict(set)
+
+    for path in glob.glob("./libs/**/pyproject.toml", recursive=True):
+        if "template" in path:
+            continue
+
+        # load regular and test deps from pyproject.toml
+        with open(path, "rb") as f:
+            pyproject = tomllib.load(f)["tool"]["poetry"]
+
+        pkg_dir = "libs" + "/".join(path.split("libs")[1].split("/")[:-1])
+        for dep in [
+            *pyproject["dependencies"].keys(),
+            *pyproject["group"]["test"]["dependencies"].keys(),
+        ]:
+            if "langchain" in dep:
+                dependents[dep].add(pkg_dir)
+                continue
+
+        # load extended deps from extended_testing_deps.txt
+        package_path = Path(path).parent
+        extended_requirement_path = package_path / "extended_testing_deps.txt"
+        if extended_requirement_path.exists():
+            with open(extended_requirement_path, "r") as f:
+                extended_deps = f.read().splitlines()
+                for depline in extended_deps:
+                    if depline.startswith("-e "):
+                        # editable dependency
+                        assert depline.startswith(
+                            "-e ../partners/"
+                        ), "Extended test deps should only editable install partner packages"
+                        partner = depline.split("partners/")[1]
+                        dep = f"langchain-{partner}"
+                    else:
+                        dep = depline.split("==")[0]
+
+                    if "langchain" in dep:
+                        dependents[dep].add(pkg_dir)
+    return dependents
+
+
+def add_dependents(dirs_to_eval: Set[str], dependents: dict) -> List[str]:
+    updated = set()
+    for dir_ in dirs_to_eval:
+        # handle core manually because it has so many dependents
+        if "core" in dir_:
+            updated.add(dir_)
+            continue
+        pkg = "langchain-" + dir_.split("/")[-1]
+        updated.update(dependents[pkg])
+        updated.add(dir_)
+    return list(updated)
+
+
+def _get_configs_for_single_dir(job: str, dir_: str) -> List[Dict[str, str]]:
+    min_python = "3.8"
+    max_python = "3.12"
+
+    # custom logic for specific directories
+    if dir_ == "libs/partners/milvus":
+        # milvus's poetry config doesn't allow 3.12 because it
+        # declares its deps in a funny way
+        max_python = "3.11"
+
+    if dir_ in ["libs/community", "libs/langchain"] and job == "extended-tests":
+        # community extended test resolution in 3.12 is slow
+        # even in uv
+        max_python = "3.11"
+
+    if dir_ == "libs/community" and job == "compile-integration-tests":
+        # community integration deps are slow in 3.12
+        max_python = "3.11"
+
+    return [
+        {"working-directory": dir_, "python-version": min_python},
+        {"working-directory": dir_, "python-version": max_python},
+    ]
+
+
+def _get_configs_for_multi_dirs(
+    job: str, dirs_to_run: List[str], dependents: dict
+) -> List[Dict[str, str]]:
+    if job == "lint":
+        dirs = add_dependents(
+            dirs_to_run["lint"] | dirs_to_run["test"] | dirs_to_run["extended-test"],
+            dependents,
+        )
+    elif job in ["test", "compile-integration-tests", "dependencies"]:
+        dirs = add_dependents(
+            dirs_to_run["test"] | dirs_to_run["extended-test"], dependents
+        )
+    elif job == "extended-tests":
+        dirs = list(dirs_to_run["extended-test"])
+    else:
+        raise ValueError(f"Unknown job: {job}")
+
+    return [
+        config for dir_ in dirs for config in _get_configs_for_single_dir(job, dir_)
+    ]
+
+
 if __name__ == "__main__":
     files = sys.argv[1:]
 
@@ -21,10 +142,11 @@
     }
     docs_edited = False
 
-    if len(files) == 300:
+    if len(files) >= 300:
         # max diff length is 300 files - there are likely files missing
-        raise ValueError("Max diff reached. Please manually run CI on changed libs.")
-
+        dirs_to_run["lint"] = all_package_dirs()
+        dirs_to_run["test"] = all_package_dirs()
+        dirs_to_run["extended-test"] = set(LANGCHAIN_DIRS)
     for file in files:
         if any(
             file.startswith(dir_)
@@ -81,14 +203,25 @@
                 docs_edited = True
             dirs_to_run["lint"].add(".")
 
-    outputs = {
-        "dirs-to-lint": list(
-            dirs_to_run["lint"] | dirs_to_run["test"] | dirs_to_run["extended-test"]
-        ),
-        "dirs-to-test": list(dirs_to_run["test"] | dirs_to_run["extended-test"]),
-        "dirs-to-extended-test": list(dirs_to_run["extended-test"]),
-        "docs-edited": "true" if docs_edited else "",
+    dependents = dependents_graph()
+
+    # dirs_to_run now maps each category (lint / test / extended-test) to its dirs
+    # TODO: clean this up
+
+    map_job_to_configs = {
+        job: _get_configs_for_multi_dirs(job, dirs_to_run, dependents)
+        for job in [
+            "lint",
+            "test",
+            "extended-tests",
+            "compile-integration-tests",
+            "dependencies",
+        ]
     }
-    for key, value in outputs.items():
+    map_job_to_configs["test-doc-imports"] = (
+        [{"python-version": "3.12"}] if docs_edited else []
+    )
+
+    for key, value in map_job_to_configs.items():
         json_output = json.dumps(value)
-        print(f"{key}={json_output}")  # noqa: T201
+        print(f"{key}={json_output}")
diff --git a/.github/scripts/check_prerelease_dependencies.py b/.github/scripts/check_prerelease_dependencies.py
@@ -0,0 +1,35 @@
+import sys
+import tomllib
+
+if __name__ == "__main__":
+    # Get the TOML file path from the command line argument
+    toml_file = sys.argv[1]
+
+    # read toml file
+    with open(toml_file, "rb") as file:
+        toml_data = tomllib.load(file)
+
+    # see if we're releasing an rc
+    version = toml_data["tool"]["poetry"]["version"]
+    releasing_rc = "rc" in version
+
+    # if not, iterate through dependencies and make sure none allow prereleases
+    if not releasing_rc:
+        dependencies = toml_data["tool"]["poetry"]["dependencies"]
+        for lib in dependencies:
+            dep_version = dependencies[lib]
+            dep_version_string = (
+                dep_version["version"] if isinstance(dep_version, dict) else dep_version
+            )
+
+            if "rc" in dep_version_string:
+                raise ValueError(
+                    f"Dependency {lib} has a prerelease version. Please remove this."
+                )
+
+            if isinstance(dep_version, dict) and dep_version.get(
+                "allow-prereleases", False
+            ):
+                raise ValueError(
+                    f"Dependency {lib} has allow-prereleases set to true. Please remove this."
+                )