From d82c36374ae7f87be2e06464e972252dcadae9a1 Mon Sep 17 00:00:00 2001
From: Predrag Gruevski <obi1kenobi82@gmail.com>
Date: Fri, 10 Jan 2025 00:00:25 +0000
Subject: [PATCH] Make `pydantic` model serialization consistent regardless of
 surrogates.

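Without this change, Pydantic models containing surrogates are serialized differently from models that don't: when orjson serialization fails and `dumps_json` falls back to `json.dumps`, the fallback used `_simple_default` as its `default` handler instead of `_serialize_json`. This leads to a less smooth user experience in LangSmith for users whose data contains surrogates.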

With this fix, Pydantic models and other tricky Python data types are always serialized the same way, whether or not they contain surrogates.
---
 python/langsmith/_internal/_serde.py          |  2 +-
 python/tests/integration_tests/test_client.py | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/python/langsmith/_internal/_serde.py b/python/langsmith/_internal/_serde.py
index 1bf8865c1..d3c7d7d10 100644
--- a/python/langsmith/_internal/_serde.py
+++ b/python/langsmith/_internal/_serde.py
@@ -146,7 +146,7 @@ def dumps_json(obj: Any) -> bytes:
         logger.debug(f"Orjson serialization failed: {repr(e)}. Falling back to json.")
         result = json.dumps(
             obj,
-            default=_simple_default,
+            default=_serialize_json,
             ensure_ascii=True,
         ).encode("utf-8")
         try:
diff --git a/python/tests/integration_tests/test_client.py b/python/tests/integration_tests/test_client.py
index 3bcd9d04c..02bd6bab0 100644
--- a/python/tests/integration_tests/test_client.py
+++ b/python/tests/integration_tests/test_client.py
@@ -19,6 +19,7 @@
 from pydantic import BaseModel
 from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor
 
+from langsmith._internal._serde import dumps_json
 from langsmith.client import ID_TYPE, Client
 from langsmith.evaluation import aevaluate, evaluate
 from langsmith.schemas import (
@@ -1155,6 +1156,37 @@ def test_surrogates():
     )
 
 
+def test_fallback_json_serialization():
+    class Document(BaseModel):
+        content: str
+
+    raw_surrogates = [
+        ("Hello\ud83d\ude00", "Hello😀"),
+        ("Python\ud83d\udc0d", "Python🐍"),
+        ("Surrogate\ud834\udd1e", "Surrogate𝄞"),
+        ("Example\ud83c\udf89", "Example🎉"),
+        ("String\ud83c\udfa7", "String🎧"),
+        ("With\ud83c\udf08", "With🌈"),
+        ("Surrogates\ud83d\ude0e", "Surrogates😎"),
+        ("Embedded\ud83d\udcbb", "Embedded💻"),
+        ("In\ud83c\udf0e", "In🌎"),
+        ("The\ud83d\udcd6", "The📖"),
+        ("Text\ud83d\udcac", "Text💬"),
+        ("收花🙄·到", "收花🙄·到"),
+    ]
+    pydantic_surrogates = [
+        (Document(content=item), expected) for item, expected in raw_surrogates
+    ]
+
+    for item, expected in raw_surrogates:
+        output = dumps_json(item).decode("utf8")
+        assert f'"{expected}"' == output
+
+    for item, expected in pydantic_surrogates:
+        output = dumps_json(item).decode("utf8")
+        assert f'{{"content":"{expected}"}}' == output
+
+
 def test_runs_stats():
     langchain_client = Client()
     # We always have stuff in the "default" project...