Move the PyPDFLoader tests from the integration suite to the unit suite.

This preamble is free text; patch tools skip everything before the first
"diff --git" header.

What the patch does:

  * libs/community/tests/integration_tests/document_loaders/test_pdf.py
      Removes `test_pypdf_loader` and `test_pypdf_loader_with_layout`,
      together with the `import re` line and the `PyPDFLoader` entry of
      the `langchain_community.document_loaders` import list.

  * libs/community/tests/unit_tests/document_loaders/test_pdf.py (new file)
      Re-adds both tests with the same assertions (1 document for
      hello.pdf, 16 documents for layout-parser-paper.pdf, and an exact
      first-page text match in extraction_mode="layout" after stripping
      NUL characters from both sides).
      Each test is decorated with `@pytest.mark.requires("pypdf")`.
      Fixture paths are built once at module level instead of inside
      each test:
        hello.pdf and layout-parser-paper-page-1.txt are still read from
        integration_tests/examples/;
        layout-parser-paper.pdf is now read from
        unit_tests/document_loaders/sample_documents/ (the removed tests
        read it from integration_tests/examples/).

diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
index 462e20d357904..50c9fde29d918 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_pdf.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -1,4 +1,3 @@
-import re
 from pathlib import Path
 from typing import Sequence, Union
 
@@ -11,7 +10,6 @@
     PDFMinerPDFasHTMLLoader,
     PyMuPDFLoader,
     PyPDFium2Loader,
-    PyPDFLoader,
     UnstructuredPDFLoader,
 )
 
@@ -86,37 +84,6 @@ def test_pdfminer_pdf_as_html_loader() -> None:
     assert len(docs) == 1
 
 
-def test_pypdf_loader() -> None:
-    """Test PyPDFLoader."""
-    file_path = Path(__file__).parent.parent / "examples/hello.pdf"
-    loader = PyPDFLoader(str(file_path))
-    docs = loader.load()
-
-    assert len(docs) == 1
-
-    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = PyPDFLoader(str(file_path))
-
-    docs = loader.load()
-    assert len(docs) == 16
-
-
-def test_pypdf_loader_with_layout() -> None:
-    """Test PyPDFLoader with layout mode."""
-    file_path = Path(__file__).parent.parent / "examples/layout-parser-paper.pdf"
-    loader = PyPDFLoader(str(file_path), extraction_mode="layout")
-
-    docs = loader.load()
-    first_page = docs[0].page_content
-
-    expected = (
-        Path(__file__).parent.parent / "examples/layout-parser-paper-page-1.txt"
-    ).read_text(encoding="utf-8")
-    cleaned_first_page = re.sub(r"\x00", "", first_page)
-    cleaned_expected = re.sub(r"\x00", "", expected)
-    assert cleaned_first_page == cleaned_expected
-
-
 def test_pypdfium2_loader() -> None:
     """Test PyPDFium2Loader."""
     file_path = Path(__file__).parent.parent / "examples/hello.pdf"
diff --git a/libs/community/tests/unit_tests/document_loaders/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/test_pdf.py
new file mode 100644
index 0000000000000..d62363723bd60
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/test_pdf.py
@@ -0,0 +1,46 @@
+import re
+from pathlib import Path
+
+import pytest
+
+from langchain_community.document_loaders import PyPDFLoader
+
+path_to_simple_pdf = (
+    Path(__file__).parent.parent.parent / "integration_tests/examples/hello.pdf"
+)
+path_to_layout_pdf = (
+    Path(__file__).parent.parent
+    / "document_loaders/sample_documents/layout-parser-paper.pdf"
+)
+path_to_layout_pdf_txt = (
+    Path(__file__).parent.parent.parent
+    / "integration_tests/examples/layout-parser-paper-page-1.txt"
+)
+
+
+@pytest.mark.requires("pypdf")
+def test_pypdf_loader() -> None:
+    """Test PyPDFLoader."""
+    loader = PyPDFLoader(str(path_to_simple_pdf))
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+    loader = PyPDFLoader(str(path_to_layout_pdf))
+
+    docs = loader.load()
+    assert len(docs) == 16
+
+
+@pytest.mark.requires("pypdf")
+def test_pypdf_loader_with_layout() -> None:
+    """Test PyPDFLoader with layout mode."""
+    loader = PyPDFLoader(str(path_to_layout_pdf), extraction_mode="layout")
+
+    docs = loader.load()
+    first_page = docs[0].page_content
+
+    expected = path_to_layout_pdf_txt.read_text(encoding="utf-8")
+    cleaned_first_page = re.sub(r"\x00", "", first_page)
+    cleaned_expected = re.sub(r"\x00", "", expected)
+    assert cleaned_first_page == cleaned_expected