diff --git a/libs/core/langchain_core/output_parsers/__init__.py b/libs/core/langchain_core/output_parsers/__init__.py
index 1bed8f58298ec..2acaab77b92a5 100644
--- a/libs/core/langchain_core/output_parsers/__init__.py
+++ b/libs/core/langchain_core/output_parsers/__init__.py
@@ -3,6 +3,7 @@
BaseLLMOutputParser,
BaseOutputParser,
)
+from langchain_core.output_parsers.json import SimpleJsonOutputParser
from langchain_core.output_parsers.list import (
CommaSeparatedListOutputParser,
ListOutputParser,
@@ -14,6 +15,7 @@
BaseCumulativeTransformOutputParser,
BaseTransformOutputParser,
)
+from langchain_core.output_parsers.xml import XMLOutputParser
__all__ = [
"BaseLLMOutputParser",
@@ -26,4 +28,6 @@
"StrOutputParser",
"BaseTransformOutputParser",
"BaseCumulativeTransformOutputParser",
+ "SimpleJsonOutputParser",
+ "XMLOutputParser",
]
diff --git a/libs/core/langchain_core/output_parsers/json.py b/libs/core/langchain_core/output_parsers/json.py
new file mode 100644
index 0000000000000..d2d7254a590aa
--- /dev/null
+++ b/libs/core/langchain_core/output_parsers/json.py
@@ -0,0 +1,195 @@
+from __future__ import annotations
+
+import json
+import re
+from json import JSONDecodeError
+from typing import Any, Callable, List, Optional
+
+import jsonpatch # type: ignore[import]
+
+from langchain_core.exceptions import OutputParserException
+from langchain_core.output_parsers.transform import BaseCumulativeTransformOutputParser
+
+
+def _replace_new_line(match: re.Match[str]) -> str:
+ value = match.group(2)
+ value = re.sub(r"\n", r"\\n", value)
+ value = re.sub(r"\r", r"\\r", value)
+ value = re.sub(r"\t", r"\\t", value)
+    value = re.sub(r'(?<!\\)"', r"\\\"", value)
+
+    return match.group(1) + value + match.group(3)
+
+
+def _custom_parser(multiline_string: str) -> str:
+ """
+ The LLM response for `action_input` may be a multiline
+ string containing unescaped newlines, tabs or quotes. This function
+ replaces those characters with their escaped counterparts.
+ (newlines in JSON must be double-escaped: `\\n`)
+ """
+ if isinstance(multiline_string, (bytes, bytearray)):
+ multiline_string = multiline_string.decode()
+
+ multiline_string = re.sub(
+ r'("action_input"\:\s*")(.*)(")',
+ _replace_new_line,
+ multiline_string,
+ flags=re.DOTALL,
+ )
+
+ return multiline_string
+
+
+# Adapted from https://github.com/KillianLucas/open-interpreter/blob/main/interpreter/utils/parse_partial_json.py
+# MIT License
+def parse_partial_json(s: str, *, strict: bool = False) -> Any:
+ """Parse a JSON string that may be missing closing braces.
+
+ Args:
+ s: The JSON string to parse.
+ strict: Whether to use strict parsing. Defaults to False.
+
+ Returns:
+ The parsed JSON object as a Python dictionary.
+ """
+ # Attempt to parse the string as-is.
+ try:
+ return json.loads(s, strict=strict)
+ except json.JSONDecodeError:
+ pass
+
+ # Initialize variables.
+ new_s = ""
+ stack = []
+ is_inside_string = False
+ escaped = False
+
+ # Process each character in the string one at a time.
+ for char in s:
+ if is_inside_string:
+ if char == '"' and not escaped:
+ is_inside_string = False
+ elif char == "\n" and not escaped:
+ char = "\\n" # Replace the newline character with the escape sequence.
+ elif char == "\\":
+ escaped = not escaped
+ else:
+ escaped = False
+ else:
+ if char == '"':
+ is_inside_string = True
+ escaped = False
+ elif char == "{":
+ stack.append("}")
+ elif char == "[":
+ stack.append("]")
+ elif char == "}" or char == "]":
+ if stack and stack[-1] == char:
+ stack.pop()
+ else:
+ # Mismatched closing character; the input is malformed.
+ return None
+
+ # Append the processed character to the new string.
+ new_s += char
+
+ # If we're still inside a string at the end of processing,
+ # we need to close the string.
+ if is_inside_string:
+ new_s += '"'
+
+ # Close any remaining open structures in the reverse order that they were opened.
+ for closing_char in reversed(stack):
+ new_s += closing_char
+
+ # Attempt to parse the modified string as JSON.
+ try:
+ return json.loads(new_s, strict=strict)
+ except json.JSONDecodeError:
+ # If we still can't parse the string as JSON, return None to indicate failure.
+ return None
+
+
+def parse_json_markdown(
+ json_string: str, *, parser: Callable[[str], Any] = json.loads
+) -> dict:
+ """
+ Parse a JSON string from a Markdown string.
+
+ Args:
+ json_string: The Markdown string.
+
+ Returns:
+ The parsed JSON object as a Python dictionary.
+ """
+ # Try to find JSON string within triple backticks
+ match = re.search(r"```(json)?(.*)```", json_string, re.DOTALL)
+
+ # If no match found, assume the entire string is a JSON string
+ if match is None:
+ json_str = json_string
+ else:
+ # If match found, use the content within the backticks
+ json_str = match.group(2)
+
+ # Strip whitespace and newlines from the start and end
+ json_str = json_str.strip()
+
+ # handle newlines and other special characters inside the returned value
+ json_str = _custom_parser(json_str)
+
+ # Parse the JSON string into a Python dictionary
+ parsed = parser(json_str)
+
+ return parsed
+
+
+def parse_and_check_json_markdown(text: str, expected_keys: List[str]) -> dict:
+ """
+ Parse a JSON string from a Markdown string and check that it
+ contains the expected keys.
+
+ Args:
+ text: The Markdown string.
+ expected_keys: The expected keys in the JSON string.
+
+ Returns:
+ The parsed JSON object as a Python dictionary.
+ """
+ try:
+ json_obj = parse_json_markdown(text)
+ except json.JSONDecodeError as e:
+ raise OutputParserException(f"Got invalid JSON object. Error: {e}")
+ for key in expected_keys:
+ if key not in json_obj:
+ raise OutputParserException(
+ f"Got invalid return object. Expected key `{key}` "
+ f"to be present, but got {json_obj}"
+ )
+ return json_obj
+
+
+class SimpleJsonOutputParser(BaseCumulativeTransformOutputParser[Any]):
+ """Parse the output of an LLM call to a JSON object.
+
+ When used in streaming mode, it will yield partial JSON objects containing
+ all the keys that have been returned so far.
+
+ In streaming, if `diff` is set to `True`, yields JSONPatch operations
+ describing the difference between the previous and the current object.
+ """
+
+ def _diff(self, prev: Optional[Any], next: Any) -> Any:
+ return jsonpatch.make_patch(prev, next).patch
+
+ def parse(self, text: str) -> Any:
+ text = text.strip()
+ try:
+ return parse_json_markdown(text.strip(), parser=parse_partial_json)
+ except JSONDecodeError as e:
+ raise OutputParserException(f"Invalid json output: {text}") from e
+
+ @property
+ def _type(self) -> str:
+ return "simple_json_output_parser"
diff --git a/libs/core/langchain_core/output_parsers/xml.py b/libs/core/langchain_core/output_parsers/xml.py
new file mode 100644
index 0000000000000..43de770d0b6bf
--- /dev/null
+++ b/libs/core/langchain_core/output_parsers/xml.py
@@ -0,0 +1,135 @@
+import re
+import xml.etree.ElementTree as ET
+from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union
+
+from langchain_core.messages import BaseMessage
+from langchain_core.output_parsers.transform import BaseTransformOutputParser
+from langchain_core.runnables.utils import AddableDict
+
+XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
+1. Output should conform to the tags below.
+2. If tags are not given, make them on your own.
+3. Remember to always open and close all the tags.
+
+As an example, for the tags ["foo", "bar", "baz"]:
+1. String "<foo>\n   <bar>\n      <baz></baz>\n   </bar>\n</foo>" is a well-formatted instance of the schema.
+2. String "<foo>\n   <bar>\n   </foo>" is a badly-formatted instance.
+3. String "<foo>\n   <tag>\n   </tag>\n</foo>" is a badly-formatted instance.
+
+Here are the output tags:
+```
+{tags}
+```""" # noqa: E501
+
+
+class XMLOutputParser(BaseTransformOutputParser):
+ """Parse an output using xml format."""
+
+ tags: Optional[List[str]] = None
+ encoding_matcher: re.Pattern = re.compile(
+ r"<([^>]*encoding[^>]*)>\n(.*)", re.MULTILINE | re.DOTALL
+ )
+
+ def get_format_instructions(self) -> str:
+ return XML_FORMAT_INSTRUCTIONS.format(tags=self.tags)
+
+ def parse(self, text: str) -> Dict[str, List[Any]]:
+ text = text.strip("`").strip("xml")
+ encoding_match = self.encoding_matcher.search(text)
+ if encoding_match:
+ text = encoding_match.group(2)
+
+ text = text.strip()
+ if (text.startswith("<") or text.startswith("\n<")) and (
+ text.endswith(">") or text.endswith(">\n")
+ ):
+ root = ET.fromstring(text)
+ return self._root_to_dict(root)
+ else:
+ raise ValueError(f"Could not parse output: {text}")
+
+ def _transform(
+ self, input: Iterator[Union[str, BaseMessage]]
+ ) -> Iterator[AddableDict]:
+ parser = ET.XMLPullParser(["start", "end"])
+ current_path: List[str] = []
+ current_path_has_children = False
+ for chunk in input:
+ if isinstance(chunk, BaseMessage):
+ # extract text
+ chunk_content = chunk.content
+ if not isinstance(chunk_content, str):
+ continue
+ chunk = chunk_content
+ # pass chunk to parser
+ parser.feed(chunk)
+ # yield all events
+ for event, elem in parser.read_events():
+ if event == "start":
+ # update current path
+ current_path.append(elem.tag)
+ current_path_has_children = False
+ elif event == "end":
+ # remove last element from current path
+ current_path.pop()
+ # yield element
+ if not current_path_has_children:
+ yield nested_element(current_path, elem)
+ # prevent yielding of parent element
+ current_path_has_children = True
+ # close parser
+ parser.close()
+
+ async def _atransform(
+ self, input: AsyncIterator[Union[str, BaseMessage]]
+ ) -> AsyncIterator[AddableDict]:
+ parser = ET.XMLPullParser(["start", "end"])
+ current_path: List[str] = []
+ current_path_has_children = False
+ async for chunk in input:
+ if isinstance(chunk, BaseMessage):
+ # extract text
+ chunk_content = chunk.content
+ if not isinstance(chunk_content, str):
+ continue
+ chunk = chunk_content
+ # pass chunk to parser
+ parser.feed(chunk)
+ # yield all events
+ for event, elem in parser.read_events():
+ if event == "start":
+ # update current path
+ current_path.append(elem.tag)
+ current_path_has_children = False
+ elif event == "end":
+ # remove last element from current path
+ current_path.pop()
+ # yield element
+ if not current_path_has_children:
+ yield nested_element(current_path, elem)
+ # prevent yielding of parent element
+ current_path_has_children = True
+ # close parser
+ parser.close()
+
+ def _root_to_dict(self, root: ET.Element) -> Dict[str, List[Any]]:
+ """Converts xml tree to python dictionary."""
+ result: Dict[str, List[Any]] = {root.tag: []}
+ for child in root:
+ if len(child) == 0:
+ result[root.tag].append({child.tag: child.text})
+ else:
+ result[root.tag].append(self._root_to_dict(child))
+ return result
+
+ @property
+ def _type(self) -> str:
+ return "xml"
+
+
+def nested_element(path: List[str], elem: ET.Element) -> Any:
+ """Get nested element from path."""
+ if len(path) == 0:
+ return AddableDict({elem.tag: elem.text})
+ else:
+ return AddableDict({path[0]: [nested_element(path[1:], elem)]})
diff --git a/libs/core/tests/unit_tests/output_parsers/test_imports.py b/libs/core/tests/unit_tests/output_parsers/test_imports.py
index 45a779225e70b..d1ee5c1b4a61b 100644
--- a/libs/core/tests/unit_tests/output_parsers/test_imports.py
+++ b/libs/core/tests/unit_tests/output_parsers/test_imports.py
@@ -11,6 +11,8 @@
"StrOutputParser",
"BaseTransformOutputParser",
"BaseCumulativeTransformOutputParser",
+ "SimpleJsonOutputParser",
+ "XMLOutputParser",
]
diff --git a/libs/core/tests/unit_tests/output_parsers/test_json.py b/libs/core/tests/unit_tests/output_parsers/test_json.py
new file mode 100644
index 0000000000000..8b2bc7d29a4f7
--- /dev/null
+++ b/libs/core/tests/unit_tests/output_parsers/test_json.py
@@ -0,0 +1,488 @@
+import json
+from typing import Any, AsyncIterator, Iterator, Tuple
+
+import pytest
+
+from langchain_core.output_parsers.json import (
+ SimpleJsonOutputParser,
+ parse_json_markdown,
+ parse_partial_json,
+)
+
+GOOD_JSON = """```json
+{
+ "foo": "bar"
+}
+```"""
+
+JSON_WITH_NEW_LINES = """
+
+```json
+{
+ "foo": "bar"
+}
+```
+
+"""
+
+JSON_WITH_NEW_LINES_INSIDE = """```json
+{
+
+ "foo": "bar"
+
+}
+```"""
+
+JSON_WITH_NEW_LINES_EVERYWHERE = """
+
+```json
+
+{
+
+ "foo": "bar"
+
+}
+
+```
+
+"""
+
+TICKS_WITH_NEW_LINES_EVERYWHERE = """
+
+```
+
+{
+
+ "foo": "bar"
+
+}
+
+```
+
+"""
+
+JSON_WITH_MARKDOWN_CODE_BLOCK = """```json
+{
+ "foo": "```bar```"
+}
+```"""
+
+JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES = """```json
+{
+ "action": "Final Answer",
+ "action_input": "```bar\n
\n\ttext\n
```"
+}
+```"""
+
+JSON_WITH_UNESCAPED_QUOTES_IN_NESTED_JSON = """```json
+{
+ "action": "Final Answer",
+ "action_input": "{"foo": "bar", "bar": "foo"}"
+}
+```"""
+
+JSON_WITH_ESCAPED_QUOTES_IN_NESTED_JSON = """```json
+{
+ "action": "Final Answer",
+ "action_input": "{\"foo\": \"bar\", \"bar\": \"foo\"}"
+}
+```"""
+
+JSON_WITH_PYTHON_DICT = """```json
+{
+ "action": "Final Answer",
+ "action_input": {"foo": "bar", "bar": "foo"}
+}
+```"""
+
+JSON_WITH_ESCAPED_DOUBLE_QUOTES_IN_NESTED_JSON = """```json
+{
+ "action": "Final Answer",
+ "action_input": "{\\"foo\\": \\"bar\\", \\"bar\\": \\"foo\\"}"
+}
+```"""
+
+NO_TICKS = """{
+ "foo": "bar"
+}"""
+
+NO_TICKS_WHITE_SPACE = """
+{
+ "foo": "bar"
+}
+"""
+
+TEXT_BEFORE = """Thought: I need to use the search tool
+
+Action:
+```
+{
+ "foo": "bar"
+}
+```"""
+
+TEXT_AFTER = """```
+{
+ "foo": "bar"
+}
+```
+This should do the trick"""
+
+TEXT_BEFORE_AND_AFTER = """Action: Testing
+
+```
+{
+ "foo": "bar"
+}
+```
+This should do the trick"""
+
+TEST_CASES = [
+ GOOD_JSON,
+ JSON_WITH_NEW_LINES,
+ JSON_WITH_NEW_LINES_INSIDE,
+ JSON_WITH_NEW_LINES_EVERYWHERE,
+ TICKS_WITH_NEW_LINES_EVERYWHERE,
+ NO_TICKS,
+ NO_TICKS_WHITE_SPACE,
+ TEXT_BEFORE,
+ TEXT_AFTER,
+]
+
+
+@pytest.mark.parametrize("json_string", TEST_CASES)
+def test_parse_json(json_string: str) -> None:
+ parsed = parse_json_markdown(json_string)
+ assert parsed == {"foo": "bar"}
+
+
+def test_parse_json_with_code_blocks() -> None:
+ parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
+ assert parsed == {"foo": "```bar```"}
+
+ parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES)
+
+ assert parsed == {
+ "action": "Final Answer",
+ "action_input": '```bar\n\n\ttext\n
```',
+ }
+
+
+TEST_CASES_ESCAPED_QUOTES = [
+ JSON_WITH_UNESCAPED_QUOTES_IN_NESTED_JSON,
+ JSON_WITH_ESCAPED_QUOTES_IN_NESTED_JSON,
+ JSON_WITH_ESCAPED_DOUBLE_QUOTES_IN_NESTED_JSON,
+]
+
+
+@pytest.mark.parametrize("json_string", TEST_CASES_ESCAPED_QUOTES)
+def test_parse_nested_json_with_escaped_quotes(json_string: str) -> None:
+ parsed = parse_json_markdown(json_string)
+ assert parsed == {
+ "action": "Final Answer",
+ "action_input": '{"foo": "bar", "bar": "foo"}',
+ }
+
+
+def test_parse_json_with_python_dict() -> None:
+ parsed = parse_json_markdown(JSON_WITH_PYTHON_DICT)
+ assert parsed == {
+ "action": "Final Answer",
+ "action_input": {"foo": "bar", "bar": "foo"},
+ }
+
+
+TEST_CASES_PARTIAL = [
+ ('{"foo": "bar", "bar": "foo"}', '{"foo": "bar", "bar": "foo"}'),
+ ('{"foo": "bar", "bar": "foo', '{"foo": "bar", "bar": "foo"}'),
+ ('{"foo": "bar", "bar": "foo}', '{"foo": "bar", "bar": "foo}"}'),
+ ('{"foo": "bar", "bar": "foo[', '{"foo": "bar", "bar": "foo["}'),
+ ('{"foo": "bar", "bar": "foo\\"', '{"foo": "bar", "bar": "foo\\""}'),
+]
+
+
+@pytest.mark.parametrize("json_strings", TEST_CASES_PARTIAL)
+def test_parse_partial_json(json_strings: Tuple[str, str]) -> None:
+ case, expected = json_strings
+ parsed = parse_partial_json(case)
+ assert parsed == json.loads(expected)
+
+
+STREAMED_TOKENS = """
+{
+
+ "
+setup
+":
+ "
+Why
+ did
+ the
+ bears
+ start
+ a
+ band
+ called
+ Bears
+ Bears
+ Bears
+ ?
+"
+,
+ "
+punchline
+":
+ "
+Because
+ they
+ wanted
+ to
+ play
+ bear
+ -y
+ good
+ music
+ !
+"
+,
+ "
+audience
+":
+ [
+"
+Haha
+"
+,
+ "
+So
+ funny
+"
+]
+
+}
+""".splitlines()
+
+EXPECTED_STREAMED_JSON = [
+ {},
+ {"setup": ""},
+ {"setup": "Why"},
+ {"setup": "Why did"},
+ {"setup": "Why did the"},
+ {"setup": "Why did the bears"},
+ {"setup": "Why did the bears start"},
+ {"setup": "Why did the bears start a"},
+ {"setup": "Why did the bears start a band"},
+ {"setup": "Why did the bears start a band called"},
+ {"setup": "Why did the bears start a band called Bears"},
+ {"setup": "Why did the bears start a band called Bears Bears"},
+ {"setup": "Why did the bears start a band called Bears Bears Bears"},
+ {"setup": "Why did the bears start a band called Bears Bears Bears ?"},
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted to",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted to play",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted to play bear",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted to play bear -y",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted to play bear -y good",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted to play bear -y good music",
+ },
+ {
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "punchline": "Because they wanted to play bear -y good music !",
+ },
+ {
+ "punchline": "Because they wanted to play bear -y good music !",
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "audience": [],
+ },
+ {
+ "punchline": "Because they wanted to play bear -y good music !",
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "audience": [""],
+ },
+ {
+ "punchline": "Because they wanted to play bear -y good music !",
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "audience": ["Haha"],
+ },
+ {
+ "punchline": "Because they wanted to play bear -y good music !",
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "audience": ["Haha", ""],
+ },
+ {
+ "punchline": "Because they wanted to play bear -y good music !",
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "audience": ["Haha", "So"],
+ },
+ {
+ "punchline": "Because they wanted to play bear -y good music !",
+ "setup": "Why did the bears start a band called Bears Bears Bears ?",
+ "audience": ["Haha", "So funny"],
+ },
+]
+
+EXPECTED_STREAMED_JSON_DIFF = [
+ [{"op": "replace", "path": "", "value": {}}],
+ [{"op": "add", "path": "/setup", "value": ""}],
+ [{"op": "replace", "path": "/setup", "value": "Why"}],
+ [{"op": "replace", "path": "/setup", "value": "Why did"}],
+ [{"op": "replace", "path": "/setup", "value": "Why did the"}],
+ [{"op": "replace", "path": "/setup", "value": "Why did the bears"}],
+ [{"op": "replace", "path": "/setup", "value": "Why did the bears start"}],
+ [{"op": "replace", "path": "/setup", "value": "Why did the bears start a"}],
+ [{"op": "replace", "path": "/setup", "value": "Why did the bears start a band"}],
+ [
+ {
+ "op": "replace",
+ "path": "/setup",
+ "value": "Why did the bears start a band called",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/setup",
+ "value": "Why did the bears start a band called Bears",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/setup",
+ "value": "Why did the bears start a band called Bears Bears",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/setup",
+ "value": "Why did the bears start a band called Bears Bears Bears",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/setup",
+ "value": "Why did the bears start a band called Bears Bears Bears ?",
+ }
+ ],
+ [{"op": "add", "path": "/punchline", "value": ""}],
+ [{"op": "replace", "path": "/punchline", "value": "Because"}],
+ [{"op": "replace", "path": "/punchline", "value": "Because they"}],
+ [{"op": "replace", "path": "/punchline", "value": "Because they wanted"}],
+ [{"op": "replace", "path": "/punchline", "value": "Because they wanted to"}],
+ [{"op": "replace", "path": "/punchline", "value": "Because they wanted to play"}],
+ [
+ {
+ "op": "replace",
+ "path": "/punchline",
+ "value": "Because they wanted to play bear",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/punchline",
+ "value": "Because they wanted to play bear -y",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/punchline",
+ "value": "Because they wanted to play bear -y good",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/punchline",
+ "value": "Because they wanted to play bear -y good music",
+ }
+ ],
+ [
+ {
+ "op": "replace",
+ "path": "/punchline",
+ "value": "Because they wanted to play bear -y good music !",
+ }
+ ],
+ [{"op": "add", "path": "/audience", "value": []}],
+ [{"op": "add", "path": "/audience/0", "value": ""}],
+ [{"op": "replace", "path": "/audience/0", "value": "Haha"}],
+ [{"op": "add", "path": "/audience/1", "value": ""}],
+ [{"op": "replace", "path": "/audience/1", "value": "So"}],
+ [{"op": "replace", "path": "/audience/1", "value": "So funny"}],
+]
+
+
+def test_partial_text_json_output_parser() -> None:
+ def input_iter(_: Any) -> Iterator[str]:
+ for token in STREAMED_TOKENS:
+ yield token
+
+ chain = input_iter | SimpleJsonOutputParser()
+
+ assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON
+
+
+def test_partial_text_json_output_parser_diff() -> None:
+ def input_iter(_: Any) -> Iterator[str]:
+ for token in STREAMED_TOKENS:
+ yield token
+
+ chain = input_iter | SimpleJsonOutputParser(diff=True)
+
+ assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON_DIFF
+
+
+async def test_partial_text_json_output_parser_async() -> None:
+ async def input_iter(_: Any) -> AsyncIterator[str]:
+ for token in STREAMED_TOKENS:
+ yield token
+
+ chain = input_iter | SimpleJsonOutputParser()
+
+ assert [p async for p in chain.astream(None)] == EXPECTED_STREAMED_JSON
+
+
+async def test_partial_text_json_output_parser_diff_async() -> None:
+ async def input_iter(_: Any) -> AsyncIterator[str]:
+ for token in STREAMED_TOKENS:
+ yield token
+
+ chain = input_iter | SimpleJsonOutputParser(diff=True)
+
+ assert [p async for p in chain.astream(None)] == EXPECTED_STREAMED_JSON_DIFF
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_xml_parser.py b/libs/core/tests/unit_tests/output_parsers/test_xml_parser.py
similarity index 95%
rename from libs/langchain/tests/unit_tests/output_parsers/test_xml_parser.py
rename to libs/core/tests/unit_tests/output_parsers/test_xml_parser.py
index 1989ddeefba47..fb92e96331a9c 100644
--- a/libs/langchain/tests/unit_tests/output_parsers/test_xml_parser.py
+++ b/libs/core/tests/unit_tests/output_parsers/test_xml_parser.py
@@ -1,7 +1,7 @@
"""Test XMLOutputParser"""
import pytest
-from langchain.output_parsers.xml import XMLOutputParser
+from langchain_core.output_parsers.xml import XMLOutputParser
DEF_RESULT_ENCODING = """
diff --git a/libs/langchain/langchain/output_parsers/format_instructions.py b/libs/langchain/langchain/output_parsers/format_instructions.py
index 94ef87210fab1..d75e5b50729f7 100644
--- a/libs/langchain/langchain/output_parsers/format_instructions.py
+++ b/libs/langchain/langchain/output_parsers/format_instructions.py
@@ -47,21 +47,6 @@
Make sure to always enclose the YAML output in triple backticks (```)"""
-XML_FORMAT_INSTRUCTIONS = """The output should be formatted as a XML file.
-1. Output should conform to the tags below.
-2. If tags are not given, make them on your own.
-3. Remember to always open and close all the tags.
-
-As an example, for the tags ["foo", "bar", "baz"]:
-1. String "<foo>\n   <bar>\n      <baz></baz>\n   </bar>\n</foo>" is a well-formatted instance of the schema.
-2. String "<foo>\n   <bar>\n   </foo>" is a badly-formatted instance.
-3. String "<foo>\n   <tag>\n   </tag>\n</foo>" is a badly-formatted instance.
-
-Here are the output tags:
-```
-{tags}
-```"""
-
PANDAS_DATAFRAME_FORMAT_INSTRUCTIONS = """The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
1. The column names are limited to the possible columns below.
diff --git a/libs/langchain/langchain/output_parsers/json.py b/libs/langchain/langchain/output_parsers/json.py
index dacf92f2d37a5..b0263889daa1e 100644
--- a/libs/langchain/langchain/output_parsers/json.py
+++ b/libs/langchain/langchain/output_parsers/json.py
@@ -1,194 +1,13 @@
-from __future__ import annotations
-
-import json
-import re
-from json import JSONDecodeError
-from typing import Any, Callable, List, Optional
-
-import jsonpatch
-from langchain_core.exceptions import OutputParserException
-from langchain_core.output_parsers import BaseCumulativeTransformOutputParser
-
-
-def _replace_new_line(match: re.Match[str]) -> str:
- value = match.group(2)
- value = re.sub(r"\n", r"\\n", value)
- value = re.sub(r"\r", r"\\r", value)
- value = re.sub(r"\t", r"\\t", value)
-    value = re.sub(r'(?<!\\)"', r"\\\"", value)
-
-    return match.group(1) + value + match.group(3)
-
-
-def _custom_parser(multiline_string: str) -> str:
- """
- The LLM response for `action_input` may be a multiline
- string containing unescaped newlines, tabs or quotes. This function
- replaces those characters with their escaped counterparts.
- (newlines in JSON must be double-escaped: `\\n`)
- """
- if isinstance(multiline_string, (bytes, bytearray)):
- multiline_string = multiline_string.decode()
-
- multiline_string = re.sub(
- r'("action_input"\:\s*")(.*)(")',
- _replace_new_line,
- multiline_string,
- flags=re.DOTALL,
- )
-
- return multiline_string
-
-
-# Adapted from https://github.com/KillianLucas/open-interpreter/blob/main/interpreter/utils/parse_partial_json.py
-# MIT License
-def parse_partial_json(s: str, *, strict: bool = False) -> Any:
- """Parse a JSON string that may be missing closing braces.
-
- Args:
- s: The JSON string to parse.
- strict: Whether to use strict parsing. Defaults to False.
-
- Returns:
- The parsed JSON object as a Python dictionary.
- """
- # Attempt to parse the string as-is.
- try:
- return json.loads(s, strict=strict)
- except json.JSONDecodeError:
- pass
-
- # Initialize variables.
- new_s = ""
- stack = []
- is_inside_string = False
- escaped = False
-
- # Process each character in the string one at a time.
- for char in s:
- if is_inside_string:
- if char == '"' and not escaped:
- is_inside_string = False
- elif char == "\n" and not escaped:
- char = "\\n" # Replace the newline character with the escape sequence.
- elif char == "\\":
- escaped = not escaped
- else:
- escaped = False
- else:
- if char == '"':
- is_inside_string = True
- escaped = False
- elif char == "{":
- stack.append("}")
- elif char == "[":
- stack.append("]")
- elif char == "}" or char == "]":
- if stack and stack[-1] == char:
- stack.pop()
- else:
- # Mismatched closing character; the input is malformed.
- return None
-
- # Append the processed character to the new string.
- new_s += char
-
- # If we're still inside a string at the end of processing,
- # we need to close the string.
- if is_inside_string:
- new_s += '"'
-
- # Close any remaining open structures in the reverse order that they were opened.
- for closing_char in reversed(stack):
- new_s += closing_char
-
- # Attempt to parse the modified string as JSON.
- try:
- return json.loads(new_s, strict=strict)
- except json.JSONDecodeError:
- # If we still can't parse the string as JSON, return None to indicate failure.
- return None
-
-
-def parse_json_markdown(
- json_string: str, *, parser: Callable[[str], Any] = json.loads
-) -> dict:
- """
- Parse a JSON string from a Markdown string.
-
- Args:
- json_string: The Markdown string.
-
- Returns:
- The parsed JSON object as a Python dictionary.
- """
- # Try to find JSON string within triple backticks
- match = re.search(r"```(json)?(.*)```", json_string, re.DOTALL)
-
- # If no match found, assume the entire string is a JSON string
- if match is None:
- json_str = json_string
- else:
- # If match found, use the content within the backticks
- json_str = match.group(2)
-
- # Strip whitespace and newlines from the start and end
- json_str = json_str.strip()
-
- # handle newlines and other special characters inside the returned value
- json_str = _custom_parser(json_str)
-
- # Parse the JSON string into a Python dictionary
- parsed = parser(json_str)
-
- return parsed
-
-
-def parse_and_check_json_markdown(text: str, expected_keys: List[str]) -> dict:
- """
- Parse a JSON string from a Markdown string and check that it
- contains the expected keys.
-
- Args:
- text: The Markdown string.
- expected_keys: The expected keys in the JSON string.
-
- Returns:
- The parsed JSON object as a Python dictionary.
- """
- try:
- json_obj = parse_json_markdown(text)
- except json.JSONDecodeError as e:
- raise OutputParserException(f"Got invalid JSON object. Error: {e}")
- for key in expected_keys:
- if key not in json_obj:
- raise OutputParserException(
- f"Got invalid return object. Expected key `{key}` "
- f"to be present, but got {json_obj}"
- )
- return json_obj
-
-
-class SimpleJsonOutputParser(BaseCumulativeTransformOutputParser[Any]):
- """Parse the output of an LLM call to a JSON object.
-
- When used in streaming mode, it will yield partial JSON objects containing
- all the keys that have been returned so far.
-
- In streaming, if `diff` is set to `True`, yields JSONPatch operations
- describing the difference between the previous and the current object.
- """
-
- def _diff(self, prev: Optional[Any], next: Any) -> Any:
- return jsonpatch.make_patch(prev, next).patch
-
- def parse(self, text: str) -> Any:
- text = text.strip()
- try:
- return parse_json_markdown(text.strip(), parser=parse_partial_json)
- except JSONDecodeError as e:
- raise OutputParserException(f"Invalid json output: {text}") from e
-
- @property
- def _type(self) -> str:
- return "simple_json_output_parser"
+from langchain_core.output_parsers.json import (
+ SimpleJsonOutputParser,
+ parse_and_check_json_markdown,
+ parse_json_markdown,
+ parse_partial_json,
+)
+
+__all__ = [
+ "SimpleJsonOutputParser",
+ "parse_partial_json",
+ "parse_json_markdown",
+ "parse_and_check_json_markdown",
+]
diff --git a/libs/langchain/langchain/output_parsers/xml.py b/libs/langchain/langchain/output_parsers/xml.py
index ee0a9e86179b2..655dc65b9ef07 100644
--- a/libs/langchain/langchain/output_parsers/xml.py
+++ b/libs/langchain/langchain/output_parsers/xml.py
@@ -1,122 +1,3 @@
-import re
-import xml.etree.ElementTree as ET
-from typing import Any, AsyncIterator, Dict, Iterator, List, Optional, Union
+from langchain_core.output_parsers.xml import XMLOutputParser
-from langchain_core.messages import BaseMessage
-from langchain_core.output_parsers.transform import BaseTransformOutputParser
-from langchain_core.runnables.utils import AddableDict
-
-from langchain.output_parsers.format_instructions import XML_FORMAT_INSTRUCTIONS
-
-
-class XMLOutputParser(BaseTransformOutputParser):
- """Parse an output using xml format."""
-
- tags: Optional[List[str]] = None
- encoding_matcher: re.Pattern = re.compile(
- r"<([^>]*encoding[^>]*)>\n(.*)", re.MULTILINE | re.DOTALL
- )
-
- def get_format_instructions(self) -> str:
- return XML_FORMAT_INSTRUCTIONS.format(tags=self.tags)
-
- def parse(self, text: str) -> Dict[str, List[Any]]:
- text = text.strip("`").strip("xml")
- encoding_match = self.encoding_matcher.search(text)
- if encoding_match:
- text = encoding_match.group(2)
-
- text = text.strip()
- if (text.startswith("<") or text.startswith("\n<")) and (
- text.endswith(">") or text.endswith(">\n")
- ):
- root = ET.fromstring(text)
- return self._root_to_dict(root)
- else:
- raise ValueError(f"Could not parse output: {text}")
-
- def _transform(
- self, input: Iterator[Union[str, BaseMessage]]
- ) -> Iterator[AddableDict]:
- parser = ET.XMLPullParser(["start", "end"])
- current_path: List[str] = []
- current_path_has_children = False
- for chunk in input:
- if isinstance(chunk, BaseMessage):
- # extract text
- chunk_content = chunk.content
- if not isinstance(chunk_content, str):
- continue
- chunk = chunk_content
- # pass chunk to parser
- parser.feed(chunk)
- # yield all events
- for event, elem in parser.read_events():
- if event == "start":
- # update current path
- current_path.append(elem.tag)
- current_path_has_children = False
- elif event == "end":
- # remove last element from current path
- current_path.pop()
- # yield element
- if not current_path_has_children:
- yield nested_element(current_path, elem)
- # prevent yielding of parent element
- current_path_has_children = True
- # close parser
- parser.close()
-
- async def _atransform(
- self, input: AsyncIterator[Union[str, BaseMessage]]
- ) -> AsyncIterator[AddableDict]:
- parser = ET.XMLPullParser(["start", "end"])
- current_path: List[str] = []
- current_path_has_children = False
- async for chunk in input:
- if isinstance(chunk, BaseMessage):
- # extract text
- chunk_content = chunk.content
- if not isinstance(chunk_content, str):
- continue
- chunk = chunk_content
- # pass chunk to parser
- parser.feed(chunk)
- # yield all events
- for event, elem in parser.read_events():
- if event == "start":
- # update current path
- current_path.append(elem.tag)
- current_path_has_children = False
- elif event == "end":
- # remove last element from current path
- current_path.pop()
- # yield element
- if not current_path_has_children:
- yield nested_element(current_path, elem)
- # prevent yielding of parent element
- current_path_has_children = True
- # close parser
- parser.close()
-
- def _root_to_dict(self, root: ET.Element) -> Dict[str, List[Any]]:
- """Converts xml tree to python dictionary."""
- result: Dict[str, List[Any]] = {root.tag: []}
- for child in root:
- if len(child) == 0:
- result[root.tag].append({child.tag: child.text})
- else:
- result[root.tag].append(self._root_to_dict(child))
- return result
-
- @property
- def _type(self) -> str:
- return "xml"
-
-
-def nested_element(path: List[str], elem: ET.Element) -> Any:
- """Get nested element from path."""
- if len(path) == 0:
- return AddableDict({elem.tag: elem.text})
- else:
- return AddableDict({path[0]: [nested_element(path[1:], elem)]})
+__all__ = ["XMLOutputParser"]
diff --git a/libs/langchain/tests/unit_tests/output_parsers/test_json.py b/libs/langchain/tests/unit_tests/output_parsers/test_json.py
index 52f59d9aaf0cc..e48ecd9b4abce 100644
--- a/libs/langchain/tests/unit_tests/output_parsers/test_json.py
+++ b/libs/langchain/tests/unit_tests/output_parsers/test_json.py
@@ -1,14 +1,7 @@
-import json
-from typing import Any, AsyncIterator, Iterator, Tuple
+from typing import Any, AsyncIterator, Iterator
-import pytest
from langchain_core.messages import AIMessageChunk
-from langchain.output_parsers.json import (
- SimpleJsonOutputParser,
- parse_json_markdown,
- parse_partial_json,
-)
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
GOOD_JSON = """```json
@@ -152,24 +145,6 @@
]
-@pytest.mark.parametrize("json_string", TEST_CASES)
-def test_parse_json(json_string: str) -> None:
- parsed = parse_json_markdown(json_string)
- assert parsed == {"foo": "bar"}
-
-
-def test_parse_json_with_code_blocks() -> None:
- parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK)
- assert parsed == {"foo": "```bar```"}
-
- parsed = parse_json_markdown(JSON_WITH_MARKDOWN_CODE_BLOCK_AND_NEWLINES)
-
- assert parsed == {
- "action": "Final Answer",
- "action_input": '```bar\n\n\ttext\n```',
- }
-
-
TEST_CASES_ESCAPED_QUOTES = [
JSON_WITH_UNESCAPED_QUOTES_IN_NESTED_JSON,
JSON_WITH_ESCAPED_QUOTES_IN_NESTED_JSON,
@@ -177,23 +152,6 @@ def test_parse_json_with_code_blocks() -> None:
]
-@pytest.mark.parametrize("json_string", TEST_CASES_ESCAPED_QUOTES)
-def test_parse_nested_json_with_escaped_quotes(json_string: str) -> None:
- parsed = parse_json_markdown(json_string)
- assert parsed == {
- "action": "Final Answer",
- "action_input": '{"foo": "bar", "bar": "foo"}',
- }
-
-
-def test_parse_json_with_python_dict() -> None:
- parsed = parse_json_markdown(JSON_WITH_PYTHON_DICT)
- assert parsed == {
- "action": "Final Answer",
- "action_input": {"foo": "bar", "bar": "foo"},
- }
-
-
TEST_CASES_PARTIAL = [
('{"foo": "bar", "bar": "foo"}', '{"foo": "bar", "bar": "foo"}'),
('{"foo": "bar", "bar": "foo', '{"foo": "bar", "bar": "foo"}'),
@@ -203,13 +161,6 @@ def test_parse_json_with_python_dict() -> None:
]
-@pytest.mark.parametrize("json_strings", TEST_CASES_PARTIAL)
-def test_parse_partial_json(json_strings: Tuple[str, str]) -> None:
- case, expected = json_strings
- parsed = parse_partial_json(case)
- assert parsed == json.loads(expected)
-
-
STREAMED_TOKENS = """
{
@@ -450,16 +401,6 @@ def test_parse_partial_json(json_strings: Tuple[str, str]) -> None:
]
-def test_partial_text_json_output_parser() -> None:
- def input_iter(_: Any) -> Iterator[str]:
- for token in STREAMED_TOKENS:
- yield token
-
- chain = input_iter | SimpleJsonOutputParser()
-
- assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON
-
-
def test_partial_functions_json_output_parser() -> None:
def input_iter(_: Any) -> Iterator[AIMessageChunk]:
for token in STREAMED_TOKENS:
@@ -472,16 +413,6 @@ def input_iter(_: Any) -> Iterator[AIMessageChunk]:
assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON
-def test_partial_text_json_output_parser_diff() -> None:
- def input_iter(_: Any) -> Iterator[str]:
- for token in STREAMED_TOKENS:
- yield token
-
- chain = input_iter | SimpleJsonOutputParser(diff=True)
-
- assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON_DIFF
-
-
def test_partial_functions_json_output_parser_diff() -> None:
def input_iter(_: Any) -> Iterator[AIMessageChunk]:
for token in STREAMED_TOKENS:
@@ -494,16 +425,6 @@ def input_iter(_: Any) -> Iterator[AIMessageChunk]:
assert list(chain.stream(None)) == EXPECTED_STREAMED_JSON_DIFF
-async def test_partial_text_json_output_parser_async() -> None:
- async def input_iter(_: Any) -> AsyncIterator[str]:
- for token in STREAMED_TOKENS:
- yield token
-
- chain = input_iter | SimpleJsonOutputParser()
-
- assert [p async for p in chain.astream(None)] == EXPECTED_STREAMED_JSON
-
-
async def test_partial_functions_json_output_parser_async() -> None:
async def input_iter(_: Any) -> AsyncIterator[AIMessageChunk]:
for token in STREAMED_TOKENS:
@@ -516,16 +437,6 @@ async def input_iter(_: Any) -> AsyncIterator[AIMessageChunk]:
assert [p async for p in chain.astream(None)] == EXPECTED_STREAMED_JSON
-async def test_partial_text_json_output_parser_diff_async() -> None:
- async def input_iter(_: Any) -> AsyncIterator[str]:
- for token in STREAMED_TOKENS:
- yield token
-
- chain = input_iter | SimpleJsonOutputParser(diff=True)
-
- assert [p async for p in chain.astream(None)] == EXPECTED_STREAMED_JSON_DIFF
-
-
async def test_partial_functions_json_output_parser_diff_async() -> None:
async def input_iter(_: Any) -> AsyncIterator[AIMessageChunk]:
for token in STREAMED_TOKENS: