Skip to content

Commit

Permalink
batch example: added archive, small corrections and optimizations (#1277
Browse files Browse the repository at this point in the history
)

* Added archive, small corrections

Added a function to archive the original documents, and avoid calling ocrmypdf on files that are already PDF/A.

* Added Copyright
  • Loading branch information
NilsRo authored Mar 18, 2024
1 parent e7eb8fa commit feeb9f2
Showing 1 changed file with 45 additions and 10 deletions.
55 changes: 45 additions & 10 deletions misc/batch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>
# SPDX-FileCopyrightText: 2024 nilsro <https://github.com/nilsro>
# SPDX-License-Identifier: MIT

"""Example of using ocrmypdf as a library in a script.
Expand All @@ -15,39 +16,73 @@

import logging
import sys
import os
import posixpath
import shutil
import filecmp
from pathlib import Path

import ocrmypdf

# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy

def filecompare(a, b) -> bool:
    """Return True if files *a* and *b* appear to be identical.

    The comparison is shallow: files whose ``os.stat`` signatures (type,
    size and modification time) match are considered equal without reading
    their contents; otherwise ``filecmp.cmp`` falls back to comparing
    contents.

    Args:
        a: Path of the first file.
        b: Path of the second file.

    Returns:
        True when the files compare equal, False when they differ or when
        either file does not exist.
    """
    try:
        return filecmp.cmp(a, b, shallow=True)
    except FileNotFoundError:
        # A missing file (e.g. no archived copy yet) simply means "not equal".
        return False


script_dir = Path(__file__).parent
# Set archive_dir to a directory in which the original documents are backed up
# before OCR. Leave it empty ("") if no backup is required.
archive_dir = "/pdfbak"

# First optional argument: directory tree to scan (default: current directory).
if len(sys.argv) > 1:
    start_dir = Path(sys.argv[1])
else:
    start_dir = Path(".")

# Second optional argument: log file path.
if len(sys.argv) > 2:
    log_file = Path(sys.argv[2])
else:
    # with_name() replaces the last component of script_dir, so the default
    # log file is created next to the script's directory, not inside it.
    log_file = script_dir.with_name("ocr-tree.log")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    filename=log_file,
    filemode="a",  # append, so repeated runs accumulate in one log
)

logging.info(f"Start directory {start_dir}")

# Configure OCRmyPDF's own logging at its default verbosity.
ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

# Walk the tree below start_dir and OCR every PDF in place.
for filename in start_dir.glob("**/*.pdf"):
    logging.info(f"Processing {filename}")

    # Files that already claim PDF/A conformance are not passed to ocrmypdf
    # again.
    if ocrmypdf.pdfa.file_claims_pdfa(filename)["pass"]:
        logging.info("Skipped document because it is already PDF/A")
        continue

    if archive_dir:
        # Mirror the document's absolute path below archive_dir. Joining path
        # components (instead of concatenating strings) also yields a correct
        # location when the glob produced a relative path.
        source_path = filename.resolve()
        archive_filename = Path(archive_dir) / source_path.relative_to(
            source_path.anchor
        )
        # Only copy when there is no archived copy yet or it differs.
        if not filecompare(filename, archive_filename):
            logging.info(f"Archiving document to {archive_filename}")
            # Errors here propagate on purpose: the original must not be
            # overwritten by OCR if it could not be backed up first.
            archive_filename.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(filename, archive_filename)

    try:
        result = ocrmypdf.ocr(filename, filename, deskew=True)
    except ocrmypdf.exceptions.EncryptedPdfError:
        logging.info("Skipped document because it is encrypted")
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logging.info("Skipped document because it already contained text")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logging.info("Skipped document because it has a digital signature")
    except ocrmypdf.exceptions.TaggedPDFError:
        logging.info(
            "Skipped document because it does not need ocr as it is tagged"
        )
    except Exception:
        # Keep the batch running, but record the traceback. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit stop the script.
        logging.exception("Unhandled error occured")
    else:
        # Log the result only when OCR actually ran for this file, so a stale
        # or unbound ``result`` is never reported.
        logging.info(result)

0 comments on commit feeb9f2

Please sign in to comment.