community[patch]: Refactoring PDF loaders: 01 prepare (#29062)

- **Refactoring PDF loaders step 1**: "community: Refactoring PDF loaders to standardize approaches" - **Description:** Declare CloudBlobLoader in __init__.py. file_path is Union[str, PurePath] everywhere. - **Twitter handle:** pprados. This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](#28970). @eyurtsev this is the start of a PR series.
langchain-ai · Jan 7, 2025 · 2921597 · 2921597
1 parent a49448a
commit 2921597
Showing 6 changed files with 90 additions and 85 deletions.
diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
@@ -87,6 +87,7 @@
     from langchain_community.document_loaders.blob_loaders import (
         Blob,
         BlobLoader,
+        CloudBlobLoader,
         FileSystemBlobLoader,
         YoutubeAudioLoader,
     )
@@ -574,6 +575,7 @@
     "CSVLoader": "langchain_community.document_loaders.csv_loader",
     "CassandraLoader": "langchain_community.document_loaders.cassandra",
     "ChatGPTLoader": "langchain_community.document_loaders.chatgpt",
+    "CloudBlobLoader": "langchain_community.document_loaders.blob_loaders",
     "CoNLLULoader": "langchain_community.document_loaders.conllu",
     "CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential",  # noqa: E501
     "ConcurrentLoader": "langchain_community.document_loaders.concurrent",
@@ -781,6 +783,7 @@ def __getattr__(name: str) -> Any:
     "CSVLoader",
     "CassandraLoader",
     "ChatGPTLoader",
+    "CloudBlobLoader",
     "CoNLLULoader",
     "CollegeConfidentialLoader",
     "ConcurrentLoader",

diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -6,7 +6,6 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Dict,
     Iterable,
     Iterator,
     Mapping,
@@ -23,15 +22,13 @@
 from langchain_community.document_loaders.blob_loaders import Blob
 
 if TYPE_CHECKING:
-    import fitz.fitz
-    import pdfminer.layout
-    import pdfplumber.page
-    import pypdf._page
-    import pypdfium2._helpers.page
-    from pypdf import PageObject
+    import fitz
+    import pdfminer
+    import pdfplumber
+    import pypdf
+    import pypdfium2
     from textractor.data.text_linearization_config import TextLinearizationConfig
 
-
 _PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
 _PDF_FILTER_WITHOUT_LOSS = [
     "LZWDecode",
@@ -90,7 +87,7 @@ def __init__(
         extract_images: bool = False,
         *,
         extraction_mode: str = "plain",
-        extraction_kwargs: Optional[Dict[str, Any]] = None,
+        extraction_kwargs: Optional[dict[str, Any]] = None,
     ):
         self.password = password
         self.extract_images = extract_images
@@ -107,7 +104,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
                 "`pip install pypdf`"
             )
 
-        def _extract_text_from_page(page: "PageObject") -> str:
+        def _extract_text_from_page(page: pypdf.PageObject) -> str:
             """
             Extract text from image given the version of pypdf.
             """
@@ -126,12 +123,13 @@ def _extract_text_from_page(page: "PageObject") -> str:
                 Document(
                     page_content=_extract_text_from_page(page=page)
                     + self._extract_images_from_page(page),
-                    metadata={"source": blob.source, "page": page_number},  # type: ignore[attr-defined]
+                    metadata={"source": blob.source, "page": page_number},
+                    # type: ignore[attr-defined]
                 )
                 for page_number, page in enumerate(pdf_reader.pages)
             ]
 
-    def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
+    def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
         """Extract images from page and get the text with RapidOCR."""
         if not self.extract_images or "/XObject" not in page["/Resources"].keys():  # type: ignore[attr-defined]
             return ""
@@ -307,9 +305,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-ty
                 for page in doc
             ]
 
-    def _get_page_content(
-        self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
-    ) -> str:
+    def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
         """
         Get the text of the page using PyMuPDF and RapidOCR and issue a warning
         if it is empty.
@@ -327,7 +323,7 @@ def _get_page_content(
         return content
 
     def _extract_metadata(
-        self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
+        self, doc: fitz.Document, page: fitz.Page, blob: Blob
     ) -> dict:
         """Extract metadata from the document and page."""
         return dict(
@@ -344,9 +340,7 @@ def _extract_metadata(
             },
         )
 
-    def _extract_images_from_page(
-        self, doc: fitz.fitz.Document, page: fitz.fitz.Page
-    ) -> str:
+    def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
         """Extract images from page and get the text with RapidOCR."""
         if not self.extract_images:
             return ""
@@ -558,7 +552,7 @@ def __init__(
         textract_features: Optional[Sequence[int]] = None,
         client: Optional[Any] = None,
         *,
-        linearization_config: Optional["TextLinearizationConfig"] = None,
+        linearization_config: Optional[TextLinearizationConfig] = None,
     ) -> None:
         """Initializes the parser.