From d303b42c8610d3bec0be19b089008ece821cbe9e Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sun, 27 Oct 2024 22:01:36 -0700 Subject: [PATCH] Add documentation to PdfMinerState --- src/ocrmypdf/pdfinfo/layout.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/ocrmypdf/pdfinfo/layout.py b/src/ocrmypdf/pdfinfo/layout.py index 679662a69..7660ab52e 100644 --- a/src/ocrmypdf/pdfinfo/layout.py +++ b/src/ocrmypdf/pdfinfo/layout.py @@ -322,7 +322,19 @@ def get_page_analysis( class PdfMinerState: - def __init__(self, infile: Path, pscript5_mode: bool): + """Provide a context manager for using pdfminer.six. + + This ensures that the file is closed. It also provides a cache of pages + from the PDF so that they can be reused if needed, to improve performance. + """ + + def __init__(self, infile: Path, pscript5_mode: bool) -> None: + """Initialize the context manager. + + Args: + infile: The path to the PDF file to be analyzed. + pscript5_mode: Whether the PDF was generated by PScript5.dll. + """ self.infile = infile self.rman = pdfminer.pdfinterp.PDFResourceManager(caching=True) self.disable_boxes_flow = None @@ -331,15 +343,18 @@ def __init__(self, infile: Path, pscript5_mode: bool): self.file = None def __enter__(self): + """Enter the context manager.""" self.file = Path(self.infile).open('rb') return self def __exit__(self, exc_type, exc_value, traceback): + """Exit the context manager.""" if self.file: self.file.close() return True def _load_page_cache(self): + """Load the page cache.""" try: self.page_cache = list(PDFPage.get_pages(self.file)) if not self.page_cache: @@ -355,6 +370,7 @@ def _load_page_cache(self): raise EncryptedPdfError() from e def get_page_analysis(self, pageno: int): + """Get the page analysis for a given page.""" if not self.page_cache: self._load_page_cache() page = self.page_cache[pageno]