Skip to content

Commit

Permalink
fix!: make PyPDFToDocument JSON-serializable (deepset-ai#6396)
Browse files Browse the repository at this point in the history
* add registry

* release not

* add checks

* rm superflous check

* fix typo

* rm print :-)
  • Loading branch information
anakin87 authored Nov 23, 2023
1 parent a492771 commit b0b5147
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 9 deletions.
32 changes: 25 additions & 7 deletions haystack/preview/components/converters/pypdf.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import io
import logging
from typing import List, Union, Optional, Protocol
from typing import List, Union, Protocol, Dict
from pathlib import Path

from haystack.preview.dataclasses import ByteStream
from haystack.preview.lazy_imports import LazyImport
from haystack.preview import Document, component
from haystack.preview import Document, component, default_to_dict

with LazyImport("Run 'pip install pypdf'") as pypdf_import:
from pypdf import PdfReader
Expand Down Expand Up @@ -34,6 +34,11 @@ def convert(self, reader: "PdfReader") -> Document:
return Document(content=text)


# This registry is used to store converters names and instances.
# It can be used to register custom converters.
CONVERTERS_REGISTRY: Dict[str, PyPDFConverter] = {"default": DefaultConverter()}


@component
class PyPDFToDocument:
"""
Expand All @@ -42,14 +47,27 @@ class PyPDFToDocument:
A default text extraction converter is used if no custom converter is provided.
"""

def __init__(self, converter: Optional[PyPDFConverter] = None):
def __init__(self, converter_name: str = "default"):
"""
Initializes the PyPDFToDocument component with an optional custom converter.
:param converter: A converter instance that adheres to the PyPDFConverter protocol.
If None, the DefaultConverter is used.
:param converter_name: A converter name that is registered in the CONVERTERS_REGISTRY.
Defaults to 'default'.
"""
pypdf_import.check()
self.converter: PyPDFConverter = converter or DefaultConverter()

try:
converter = CONVERTERS_REGISTRY[converter_name]
except KeyError:
msg = (
f"Invalid converter_name: {converter_name}.\n Available converters: {list(CONVERTERS_REGISTRY.keys())}"
)
raise ValueError(msg) from KeyError
self.converter_name = converter_name
self._converter: PyPDFConverter = converter

def to_dict(self):
# do not serialize the _converter instance
return default_to_dict(self, converter_name=self.converter_name)

@component.output_types(documents=List[Document])
def run(self, sources: List[Union[str, Path, ByteStream]]):
Expand All @@ -63,7 +81,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]]):
for source in sources:
try:
pdf_reader = self._get_pdf_reader(source)
document = self.converter.convert(pdf_reader)
document = self._converter.convert(pdf_reader)
except Exception as e:
logger.warning("Could not read %s and convert it to Document, skipping. %s", source, e)
continue
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
preview:
- |
Make `PyPDFToDocument` accept a `converter_name` parameter instead of
a `converter` instance, to allow a smooth serialization of the component.
15 changes: 13 additions & 2 deletions test/preview/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,20 @@
from pypdf import PdfReader

from haystack.preview import Document
from haystack.preview.components.converters.pypdf import PyPDFToDocument
from haystack.preview.components.converters.pypdf import PyPDFToDocument, CONVERTERS_REGISTRY
from haystack.preview.dataclasses import ByteStream


class TestPyPDFToDocument:
def test_init(self):
component = PyPDFToDocument()
assert component.converter_name == "default"
assert hasattr(component, "_converter")

def test_init_fail_nonexisting_converter(self):
with pytest.raises(ValueError):
PyPDFToDocument(converter_name="non_existing_converter")

@pytest.mark.unit
def test_run(self, preview_samples_path):
"""
Expand Down Expand Up @@ -58,7 +67,9 @@ class MyCustomConverter:
def convert(self, reader: PdfReader) -> Document:
return Document(content="I don't care about converting given pdfs, I always return this")

converter = PyPDFToDocument(converter=MyCustomConverter())
CONVERTERS_REGISTRY["custom"] = MyCustomConverter()

converter = PyPDFToDocument(converter_name="custom")
output = converter.run(sources=paths)
docs = output["documents"]
assert len(docs) == 1
Expand Down

0 comments on commit b0b5147

Please sign in to comment.