Skip to content

Commit

Permalink
Fix deprecated load() with kwargs
Browse files Browse the repository at this point in the history
  • Loading branch information
pprados committed Jan 14, 2025
1 parent 9b45bd8 commit acf4358
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 5 deletions.
20 changes: 17 additions & 3 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,15 @@ def __init__(
self.extract_tables_settings = extract_tables_settings

def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
    """Lazily parse ``blob`` by delegating to ``_lazy_parse``.

    No ``text_kwargs`` are supplied here, so ``_lazy_parse`` runs with its
    default for that parameter.
    """
    documents = self._lazy_parse(blob)
    return documents

def _lazy_parse(
self,
blob: Blob,
text_kwargs: Optional[dict[str, Any]] = None, # deprecated
) -> Iterator[Document]: # type: ignore[valid-type]
"""Lazily parse the blob.
Insert image, if possible, between two paragraphs.
In this way, a paragraph can be continued on the next page.
Expand All @@ -547,6 +556,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
try:
import pymupdf

if not text_kwargs:
text_kwargs = {}
if not self.extract_tables_settings:
from pymupdf.table import (
DEFAULT_JOIN_TOLERANCE,
Expand Down Expand Up @@ -597,7 +608,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
doc_metadata = self._extract_metadata(doc, blob)
full_content = []
for page in doc:
all_text = self._get_page_content(doc, page, blob).strip()
all_text = self._get_page_content(doc, page, text_kwargs).strip()
if self.mode == "page":
yield Document(
page_content=all_text,
Expand All @@ -615,7 +626,10 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
)

def _get_page_content(
self, doc: pymupdf.Document, page: pymupdf.Page, blob: Blob
self,
doc: pymupdf.Document,
page: pymupdf.Page,
text_kwargs: dict[str, Any],
) -> str:
"""Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty.
Expand All @@ -628,7 +642,7 @@ def _get_page_content(
Returns:
str: The text content of the page.
"""
text_from_page = page.get_text(**self.text_kwargs)
text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs})
images_from_page = self._extract_images_from_page(doc, page)
tables_from_page = self._extract_tables_from_page(page)
extras = []
Expand Down
15 changes: 13 additions & 2 deletions libs/community/langchain_community/document_loaders/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,17 +544,28 @@ def __init__(
extract_tables_settings=extract_tables_settings,
)

def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
    """Lazy load given path as pages or single document (see `mode`).

    Insert image, if possible, between two paragraphs.
    In this way, a paragraph can be continued on the next page.

    Args:
        **kwargs: Deprecated runtime arguments. If any are given, a warning
            is logged and they are forwarded to the parser as ``text_kwargs``.

    Yields:
        The documents produced by ``self.parser`` for the loaded blob.
    """
    if kwargs:
        logger.warning(
            f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
            " is deprecated. Please pass arguments during initialization instead."
        )
    parser = self.parser
    if self.web_path:
        # Read the file through a context manager so the handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(self.file_path, "rb") as pdf_file:
            data = pdf_file.read()
        blob = Blob.from_data(data, path=self.web_path)  # type: ignore[attr-defined]
    else:
        blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]
    # Delegate once, through the kwargs-aware entry point only.
    yield from parser._lazy_parse(blob, text_kwargs=kwargs)

def load(self, **kwargs: Any) -> list[Document]:
    """Eagerly load every document and return them as a list.

    Any keyword arguments are passed straight through to ``_lazy_load``.
    """
    documents = self._lazy_load(**kwargs)
    return list(documents)

def lazy_load(self) -> Iterator[Document]:
    """Lazily yield documents via ``_lazy_load`` without runtime arguments."""
    documents = self._lazy_load()
    yield from documents


# MathpixPDFLoader implementation taken largely from Daniel Gross's:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,11 @@ def test_standard_parameters(
assert loader.web_path == web_path
assert loader.file_path != web_path
assert len(docs) == 1


def test_pymupdf_deprecated_kwards() -> None:
    """Smoke-test that ``PyMuPDFLoader.load`` accepts runtime keyword arguments."""
    from langchain_community.document_loaders import PyMuPDFLoader

    # Two directories above this test file, then examples/hello.pdf.
    base_dir = Path(__file__).parent.parent
    pdf_loader = PyMuPDFLoader(file_path=base_dir / "examples/hello.pdf")
    pdf_loader.load(sort=True)

0 comments on commit acf4358

Please sign in to comment.