Skip to content

Commit

Permalink
Merge branch 'phu54321-feature-ocrthread'
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Jan 29, 2024
2 parents e21cae1 + 2b92530 commit c48f970
Showing 1 changed file with 94 additions and 10 deletions.
104 changes: 94 additions & 10 deletions ocrmypdf_easyocr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,27 @@
from __future__ import annotations

import logging
import multiprocessing.managers
import os
from multiprocessing import Semaphore
import threading
import traceback
from pathlib import Path
from typing import Optional, Sequence, Tuple

import cv2 as cv
import easyocr
import numpy.typing as npt
import pluggy
from ocrmypdf import OcrEngine, hookimpl
from ocrmypdf import Executor, OcrEngine, PdfContext, hookimpl
from ocrmypdf._exec import tesseract
from ocrmypdf.builtin_plugins.optimize import optimize_pdf as default_optimize_pdf

from ocrmypdf_easyocr._cv import detect_skew
from ocrmypdf_easyocr._easyocr import tidy_easyocr_result
from ocrmypdf_easyocr._pdf import easyocr_to_pikepdf

log = logging.getLogger(__name__)

GPU_SEMAPHORE = Semaphore(3)

ISO_639_3_2: dict[str, str] = {
"afr": "af",
"alb": "sq",
Expand Down Expand Up @@ -89,16 +93,92 @@
"vie": "vi",
}

Task = Tuple[npt.NDArray, multiprocessing.Value, threading.Event] | None


def _ocr_process(q: multiprocessing.Queue[Task], options):
reader: Optional[easyocr.Reader] = None

while True:
message = q.get()
if message is None:
return # exit process
gray, output_dict, event = message

# Init reader on first OCR attempt: Wait until `options` variable is fully initialized.
# Note: `options` variable is on the same process with the main thread.
try:
if reader is None:
use_gpu = options.gpu
languages = [ISO_639_3_2[lang] for lang in options.languages]
reader = easyocr.Reader(languages, use_gpu)
output_dict["output"] = reader.readtext(
gray, batch_size=options.easyocr_batch_size
)
except Exception as e:
traceback.print_exception(e)
output_dict["output"] = ""
finally:
event.set()


@hookimpl
def initialize(plugin_manager: pluggy.PluginManager):
pass


class ProcessList:
def __init__(self, plist):
self.process_list = plist

def __getstate__(self):
return []


@hookimpl
def check_options(options):
m = multiprocessing.Manager()
q = multiprocessing.Queue(-1)
ocr_process_list = []
for _ in range(options.easyocr_workers):
t = multiprocessing.Process(target=_ocr_process, args=(q, options), daemon=True)
t.start()
ocr_process_list.append(t)

options._easyocr_struct = {"manager": m, "queue": q}
options._easyocr_plist = ProcessList(ocr_process_list)


@hookimpl
def optimize_pdf(
input_pdf: Path,
output_pdf: Path,
context: PdfContext,
executor: Executor,
linearize: bool,
) -> tuple[Path, Sequence[str]]:
options = context.options
for _ in range(options.easyocr_workers):
q = options._easyocr_struct["queue"]
q.put(None) # send stop message
for p in options._easyocr_plist.process_list:
p.join(3.0) # clean up child processes but don't wait forever

return default_optimize_pdf(
input_pdf=input_pdf,
output_pdf=output_pdf,
context=context,
executor=executor,
linearize=linearize,
)


@hookimpl
def add_options(parser):
easyocr_options = parser.add_argument_group("EasyOCR", "EasyOCR options")
easyocr_options.add_argument("--easyocr-no-gpu", action="store_false", dest="gpu")
easyocr_options.add_argument("--easyocr-batch-size", type=int, default=4)
easyocr_options.add_argument("--easyocr-workers", type=int, default=1)
easyocr_options.add_argument(
"--easyocr-debug-suppress-images",
action="store_true",
Expand Down Expand Up @@ -146,15 +226,19 @@ def generate_hocr(input_file, output_hocr, output_text, options):

@staticmethod
def generate_pdf(input_file, output_pdf, output_text, options):
languages = [ISO_639_3_2[lang] for lang in options.languages]

img = cv.imread(os.fspath(input_file))
gray = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
with GPU_SEMAPHORE:
reader = easyocr.Reader(languages, gpu=options.gpu)
raw_results = reader.readtext(gray)
results = [tidy_easyocr_result(r) for r in raw_results]

sync_data = options._easyocr_struct
manager: multiprocessing.managers.SyncManager = sync_data["manager"]
queue: multiprocessing.Queue[Task] = sync_data["queue"]
output_dict = manager.dict()
event = manager.Event()
queue.put((gray, output_dict, event))
event.wait()
raw_results = output_dict["output"]

results = [tidy_easyocr_result(r) for r in raw_results]
text = " ".join([result.text for result in results])
output_text.write_text(text)

Expand Down

0 comments on commit c48f970

Please sign in to comment.