Skip to content

Commit

Permalink
Add documentation to PdfMinerState
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Oct 28, 2024
1 parent f77f701 commit d303b42
Showing 1 changed file with 17 additions and 1 deletion.
18 changes: 17 additions & 1 deletion src/ocrmypdf/pdfinfo/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,19 @@ def get_page_analysis(


class PdfMinerState:
def __init__(self, infile: Path, pscript5_mode: bool):
"""Provide a context manager for using pdfminer.six.
This ensures that the file is closed. It also provides a cache of pages
from the PDF so that they can be reused if needed, to improve performance.
"""

def __init__(self, infile: Path, pscript5_mode: bool) -> None:
"""Initialize the context manager.
Args:
infile: The path to the PDF file to be analyzed.
pscript5_mode: Whether the PDF was generated by PScript5.dll.
"""
self.infile = infile
self.rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)
self.disable_boxes_flow = None
Expand All @@ -331,15 +343,18 @@ def __init__(self, infile: Path, pscript5_mode: bool):
self.file = None

def __enter__(self):
"""Enter the context manager."""
self.file = Path(self.infile).open('rb')
return self

def __exit__(self, exc_type, exc_value, traceback):
"""Exit the context manager."""
if self.file:
self.file.close()
return True

def _load_page_cache(self):
"""Load the page cache."""
try:
self.page_cache = list(PDFPage.get_pages(self.file))
if not self.page_cache:
Expand All @@ -355,6 +370,7 @@ def _load_page_cache(self):
raise EncryptedPdfError() from e

def get_page_analysis(self, pageno: int):
"""Get the page analysis for a given page."""
if not self.page_cache:
self._load_page_cache()
page = self.page_cache[pageno]
Expand Down

0 comments on commit d303b42

Please sign in to comment.