Skip to content

Commit

Permalink
community[patch]: Refactoring PDF loaders: 01 prepare (#29062)
Browse files Browse the repository at this point in the history
- **Refactoring PDF loaders step 1**: "community: Refactoring PDF
loaders to standardize approaches"

- **Description:** Declare CloudBlobLoader in __init__.py. file_path is
typed as Union[str, PurePath] everywhere
- **Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be
submitted all at once.
This specific part focuses on preparing the update of all parsers.

For more details, see [PR
28970](#28970).

@eyurtsev it's the start of a PR series.
  • Loading branch information
pprados authored Jan 7, 2025
1 parent a49448a commit 2921597
Showing 6 changed files with 90 additions and 85 deletions.
Original file line number Diff line number Diff line change
@@ -87,6 +87,7 @@
from langchain_community.document_loaders.blob_loaders import (
Blob,
BlobLoader,
CloudBlobLoader,
FileSystemBlobLoader,
YoutubeAudioLoader,
)
@@ -574,6 +575,7 @@
"CSVLoader": "langchain_community.document_loaders.csv_loader",
"CassandraLoader": "langchain_community.document_loaders.cassandra",
"ChatGPTLoader": "langchain_community.document_loaders.chatgpt",
"CloudBlobLoader": "langchain_community.document_loaders.blob_loaders",
"CoNLLULoader": "langchain_community.document_loaders.conllu",
"CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501
"ConcurrentLoader": "langchain_community.document_loaders.concurrent",
@@ -781,6 +783,7 @@ def __getattr__(name: str) -> Any:
"CSVLoader",
"CassandraLoader",
"ChatGPTLoader",
"CloudBlobLoader",
"CoNLLULoader",
"CollegeConfidentialLoader",
"ConcurrentLoader",
34 changes: 14 additions & 20 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,6 @@
from typing import (
TYPE_CHECKING,
Any,
Dict,
Iterable,
Iterator,
Mapping,
@@ -23,15 +22,13 @@
from langchain_community.document_loaders.blob_loaders import Blob

if TYPE_CHECKING:
import fitz.fitz
import pdfminer.layout
import pdfplumber.page
import pypdf._page
import pypdfium2._helpers.page
from pypdf import PageObject
import fitz
import pdfminer
import pdfplumber
import pypdf
import pypdfium2
from textractor.data.text_linearization_config import TextLinearizationConfig


_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
_PDF_FILTER_WITHOUT_LOSS = [
"LZWDecode",
@@ -90,7 +87,7 @@ def __init__(
extract_images: bool = False,
*,
extraction_mode: str = "plain",
extraction_kwargs: Optional[Dict[str, Any]] = None,
extraction_kwargs: Optional[dict[str, Any]] = None,
):
self.password = password
self.extract_images = extract_images
@@ -107,7 +104,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
"`pip install pypdf`"
)

def _extract_text_from_page(page: "PageObject") -> str:
def _extract_text_from_page(page: pypdf.PageObject) -> str:
"""
Extract text from image given the version of pypdf.
"""
@@ -126,12 +123,13 @@ def _extract_text_from_page(page: "PageObject") -> str:
Document(
page_content=_extract_text_from_page(page=page)
+ self._extract_images_from_page(page),
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined]
metadata={"source": blob.source, "page": page_number},
# type: ignore[attr-defined]
)
for page_number, page in enumerate(pdf_reader.pages)
]

def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
return ""
@@ -307,9 +305,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
for page in doc
]

def _get_page_content(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
) -> str:
def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
"""
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
if it is empty.
@@ -327,7 +323,7 @@ def _get_page_content(
return content

def _extract_metadata(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
self, doc: fitz.Document, page: fitz.Page, blob: Blob
) -> dict:
"""Extract metadata from the document and page."""
return dict(
@@ -344,9 +340,7 @@ def _extract_metadata(
},
)

def _extract_images_from_page(
self, doc: fitz.fitz.Document, page: fitz.fitz.Page
) -> str:
def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
"""Extract images from page and get the text with RapidOCR."""
if not self.extract_images:
return ""
@@ -558,7 +552,7 @@ def __init__(
textract_features: Optional[Sequence[int]] = None,
client: Optional[Any] = None,
*,
linearization_config: Optional["TextLinearizationConfig"] = None,
linearization_config: Optional[TextLinearizationConfig] = None,
) -> None:
"""Initializes the parser.
Loading

0 comments on commit 2921597

Please sign in to comment.