Skip to content

Commit

Permalink
Merge branch 'main' of github.com:ocrmypdf/OCRmyPDF
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Mar 20, 2024
2 parents 8a747f0 + 855de28 commit f95aa63
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .docker/Dockerfile.alpine
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2023 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

FROM alpine:3.18 as base
FROM alpine:3.19 as base

ENV LANG=C.UTF-8
ENV TZ=UTC
Expand Down
55 changes: 45 additions & 10 deletions misc/batch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2016 findingorder <https://github.com/findingorder>
# SPDX-FileCopyrightText: 2024 nilsro <https://github.com/nilsro>
# SPDX-License-Identifier: MIT

"""Example of using ocrmypdf as a library in a script.
Expand All @@ -15,39 +16,73 @@

import logging
import sys
import os
import posixpath
import shutil
import filecmp
from pathlib import Path

import ocrmypdf

# pylint: disable=logging-format-interpolation
# pylint: disable=logging-not-lazy

def filecompare(a, b):
    """Return True if files *a* and *b* appear to be equal, otherwise False.

    The comparison is delegated to ``filecmp.cmp`` with ``shallow=True``, so
    two regular files whose ``os.stat`` signatures (type, size, modification
    time) match are taken as equal without reading them; otherwise their
    contents are compared.

    Args:
        a: Path of the first file.
        b: Path of the second file.

    Returns:
        The result of ``filecmp.cmp``, or False when either file does not
        exist.
    """
    try:
        return filecmp.cmp(a, b, shallow=True)
    except FileNotFoundError:
        # A missing file can never match.
        return False


# Directory that contains this script; the default log file path is derived
# from it below.
script_dir = Path(__file__).parent
# Set archive_dir to a directory where the original documents are backed up
# before OCR overwrites them. Leave it empty ("") if no backup is required.
archive_dir = "/pdfbak"

# First command-line argument: root of the directory tree to process.
# Defaults to the current working directory.
if len(sys.argv) > 1:
    start_dir = Path(sys.argv[1])
else:
    start_dir = Path(".")

# Second command-line argument: path of the log file.
if len(sys.argv) > 2:
    log_file = Path(sys.argv[2])
else:
    # with_name() replaces the last component of script_dir, so the default
    # log file is a sibling of the script's directory, not a file inside it.
    log_file = script_dir.with_name("ocr-tree.log")

# Append to the log file so that repeated runs accumulate in one place.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    filename=log_file,
    filemode="a",
)

logging.info(f"Start directory {start_dir}")

ocrmypdf.configure_logging(ocrmypdf.Verbosity.default)

# Walk every PDF below start_dir, optionally back up the original, then OCR
# the file in place.
for filename in start_dir.glob("**/*.pdf"):
    logging.info(f"Processing {filename}")

    # NOTE(review): a PDF/A claim is used as the "already processed" marker,
    # presumably because a previous OCR run produced PDF/A output -- confirm.
    if ocrmypdf.pdfa.file_claims_pdfa(filename)["pass"]:
        logging.info("Skipped document because it already contained text")
        continue

    if archive_dir:
        # Mirror the document's absolute path underneath archive_dir. Joining
        # path objects (instead of concatenating strings) keeps the backup
        # inside archive_dir even when start_dir is a relative path.
        source_path = filename.resolve()
        archive_filename = Path(archive_dir) / source_path.relative_to(
            source_path.anchor
        )
        if not filecompare(filename, archive_filename):
            logging.info(f"Archiving document to {archive_filename}")
            # Create the target directory first: copying to a directory path
            # that does not exist yet would create a regular file with the
            # directory's name whenever its parent happens to exist.
            archive_filename.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(filename, archive_filename)

    try:
        result = ocrmypdf.ocr(filename, filename, deskew=True)
    except ocrmypdf.exceptions.EncryptedPdfError:
        logging.info("Skipped document because it is encrypted")
    except ocrmypdf.exceptions.PriorOcrFoundError:
        logging.info("Skipped document because it already contained text")
    except ocrmypdf.exceptions.DigitalSignatureError:
        logging.info("Skipped document because it has a digital signature")
    except ocrmypdf.exceptions.TaggedPDFError:
        logging.info("Skipped document because it does not need ocr as it is tagged")
    except Exception:
        # Keep the batch running, but record the traceback. Unlike a bare
        # "except:", this lets KeyboardInterrupt and SystemExit stop the run.
        logging.exception("Unhandled error occured")
    else:
        # Only report completion when OCR actually ran to the end.
        logging.info(result)
        logging.info("OCR complete")
9 changes: 7 additions & 2 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
from PIL import Image

from ocrmypdf._exec import ghostscript, tesseract
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.helpers import Resolution
from ocrmypdf.pdfinfo import PdfInfo

from .conftest import check_ocrmypdf, have_unpaper
from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf

RENDERERS = ['hocr', 'sandwich']

Expand Down Expand Up @@ -107,14 +108,18 @@ def test_non_square_resolution(renderer, resources, outpdf):
in_pageinfo = PdfInfo(resources / 'aspect.pdf')
assert in_pageinfo[0].dpi.x != in_pageinfo[0].dpi.y

check_ocrmypdf(
proc = run_ocrmypdf(
resources / 'aspect.pdf',
outpdf,
'--pdf-renderer',
renderer,
'--plugin',
'tests/plugins/tesseract_cache.py',
)
# PDF/A conversion can fail for this file if Ghostscript >= 10.3, so don't test
# exit code in that case
if proc.returncode != ExitCode.pdfa_conversion_failed:
proc.check_returncode()

out_pageinfo = PdfInfo(outpdf)

Expand Down

0 comments on commit f95aa63

Please sign in to comment.