Merge branch 'main' of github.com:ocrmypdf/OCRmyPDF

ocrmypdf · Mar 20, 2024 · f95aa63 · f95aa63
2 parents 8a747f0 + 855de28
commit f95aa63
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 13 deletions.
diff --git a/.docker/Dockerfile.alpine b/.docker/Dockerfile.alpine
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: 2023 James R. Barlow
 # SPDX-License-Identifier: MPL-2.0
 
-FROM alpine:3.18 as base
+FROM alpine:3.19 as base
 
 ENV LANG=C.UTF-8
 ENV TZ=UTC

diff --git a/misc/batch.py b/misc/batch.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 # SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>
+# SPDX-FileCopyrightText: 2024 nilsro <https://github.com/nilsro>
 # SPDX-License-Identifier: MIT
 
 """Example of using ocrmypdf as a library in a script.
@@ -15,39 +16,73 @@
 
 import logging
 import sys
+import os
+import posixpath
+import shutil
+import filecmp
 from pathlib import Path
 
 import ocrmypdf
 
 # pylint: disable=logging-format-interpolation
 # pylint: disable=logging-not-lazy
 
+def filecompare(a,b):
+    try:
+        return filecmp.cmp(a, b, shallow=True)
+    except FileNotFoundError:
+        return False
+
+
 script_dir = Path(__file__).parent
+# Set archive_dir to a directory for backing up the original documents. Leave it empty if no backup is required.
+archive_dir = "/pdfbak"
 
 if len(sys.argv) > 1:
     start_dir = Path(sys.argv[1])
 else:
-    start_dir = Path('.')
+    start_dir = Path(".")
 
 if len(sys.argv) > 2:
     log_file = Path(sys.argv[2])
 else:
-    log_file = script_dir.with_name('ocr-tree.log')
+    log_file = script_dir.with_name("ocr-tree.log")
 
 logging.basicConfig(
     level=logging.INFO,
-    format='%(asctime)s %(message)s',
+    format="%(asctime)s %(message)s",
     filename=log_file,
-    filemode='a',
+    filemode="a",
 )
 
+logging.info(f"Start directory {start_dir}")
+
 ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)
 
-for filename in start_dir.glob("**/*.py"):
+for filename in start_dir.glob("**/*.pdf"):
     logging.info(f"Processing {filename}")
-    result = ocrmypdf.ocr(filename, filename, deskew=True)
-    if result == ocrmypdf.ExitCode.already_done_ocr:
-        logging.error("Skipped document because it already contained text")
-    elif result == ocrmypdf.ExitCode.ok:
+    if ocrmypdf.pdfa.file_claims_pdfa(filename)["pass"]:
+        logging.info("Skipped document because it already contained text")
+    else:
+        archive_filename = archive_dir + str(filename)
+        if len(archive_dir) > 0 and not filecompare(filename, archive_filename):
+            logging.info(f"Archiving document to {archive_filename}")
+            try:
+                shutil.copy2(filename, posixpath.dirname(archive_filename))
+            except IOError as io_err:
+                os.makedirs(posixpath.dirname(archive_filename))
+                shutil.copy2(filename, posixpath.dirname(archive_filename))
+        try:
+            result = ocrmypdf.ocr(filename, filename, deskew=True)
+            logging.info(result)
+        except ocrmypdf.exceptions.EncryptedPdfError:
+            logging.info("Skipped document because it is encrypted")
+        except ocrmypdf.exceptions.PriorOcrFoundError:
+            logging.info("Skipped document because it already contained text")
+        except ocrmypdf.exceptions.DigitalSignatureError:
+            logging.info("Skipped document because it has a digital signature")
+        except ocrmypdf.exceptions.TaggedPDFError:
+            logging.info("Skipped document because it does not need ocr as it is tagged")
+        except:
+            logging.error("Unhandled error occured")
         logging.info("OCR complete")
-    logging.info(result)
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
@@ -9,10 +9,11 @@
 from PIL import Image
 
 from ocrmypdf._exec import ghostscript, tesseract
+from ocrmypdf.exceptions import ExitCode
 from ocrmypdf.helpers import Resolution
 from ocrmypdf.pdfinfo import PdfInfo
 
-from .conftest import check_ocrmypdf, have_unpaper
+from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf
 
 RENDERERS = ['hocr', 'sandwich']
 
@@ -107,14 +108,18 @@ def test_non_square_resolution(renderer, resources, outpdf):
     in_pageinfo = PdfInfo(resources / 'aspect.pdf')
     assert in_pageinfo[0].dpi.x != in_pageinfo[0].dpi.y
 
-    check_ocrmypdf(
+    proc = run_ocrmypdf(
         resources / 'aspect.pdf',
         outpdf,
         '--pdf-renderer',
         renderer,
         '--plugin',
         'tests/plugins/tesseract_cache.py',
     )
+    # PDF/A conversion can fail for this file if Ghostscript >= 10.3, so don't test
+    # exit code in that case
+    if proc.returncode != ExitCode.pdfa_conversion_failed:
+        proc.check_returncode()
 
     out_pageinfo = PdfInfo(outpdf)