#!/usr/bin/env python
"""Python wrapper around the ``tesseract`` OCR command-line tool.

The public ``image_to_*`` functions build a tesseract command line, run it
as a subprocess and return its output as text, bytes, a dict or a pandas
``DataFrame`` depending on the requested ``output_type``.
"""
import re
import shlex
import string
import subprocess
import sys
from contextlib import contextmanager
from csv import QUOTE_NONE
from errno import ENOENT
from functools import wraps
from glob import iglob
from importlib.util import find_spec
from io import BytesIO
from os import environ
from os import extsep
from os import linesep
from os import remove
from os.path import normcase
from os.path import normpath
from os.path import realpath
from tempfile import NamedTemporaryFile
from time import sleep

from packaging.version import InvalidVersion
from packaging.version import parse
from packaging.version import Version
from PIL import Image


# Name or path of the tesseract executable; callers may override it.
tesseract_cmd = 'tesseract'

# numpy and pandas are optional: detect them without importing eagerly.
numpy_installed = find_spec('numpy') is not None
if numpy_installed:
    from numpy import ndarray

pandas_installed = find_spec('pandas') is not None
if pandas_installed:
    import pandas as pd

DEFAULT_ENCODING = 'utf-8'
LANG_PATTERN = re.compile('^[a-z_]+$')
RGB_MODE = 'RGB'
SUPPORTED_FORMATS = {
    'JPEG',
    'JPEG2000',
    'PNG',
    'PBM',
    'PGM',
    'PPM',
    'TIFF',
    'BMP',
    'GIF',
    'WEBP',
}

# Maps a label in tesseract's OSD report to (result key, value converter).
OSD_KEYS = {
    'Page number': ('page_num', int),
    'Orientation in degrees': ('orientation', int),
    'Rotate': ('rotate', int),
    'Orientation confidence': ('orientation_conf', float),
    'Script': ('script', str),
    'Script confidence': ('script_conf', float),
}

TESSERACT_MIN_VERSION = Version('3.05')
TESSERACT_ALTO_VERSION = Version('4.1.0')


class Output:
    """Names of the result formats accepted as ``output_type``."""

    BYTES = 'bytes'
    DATAFRAME = 'data.frame'
    DICT = 'dict'
    STRING = 'string'


class PandasNotSupported(EnvironmentError):
    """Raised when DataFrame output is requested but pandas is missing."""

    def __init__(self):
        super().__init__('Missing pandas package')


class TesseractError(RuntimeError):
    """Raised when the tesseract process exits with a non-zero status."""

    def __init__(self, status, message):
        self.status = status
        self.message = message
        self.args = (status, message)


class TesseractNotFoundError(EnvironmentError):
    """Raised when the tesseract executable cannot be started."""

    def __init__(self):
        super().__init__(
            f"{tesseract_cmd} is not installed or it's not in your PATH."
            f' See README file for more information.',
        )


class TSVNotSupported(EnvironmentError):
    """Raised when TSV output is requested from a too-old tesseract."""

    def __init__(self):
        super().__init__(
            'TSV output not supported. Tesseract >= 3.05 required',
        )


class ALTONotSupported(EnvironmentError):
    """Raised when ALTO output is requested from a too-old tesseract."""

    def __init__(self):
        super().__init__(
            'ALTO output not supported. Tesseract >= 4.1.0 required',
        )


def kill(process, code):
    """Terminate *process*, then force-kill it and set its return code.

    The process gets up to one second to exit after ``terminate()``; it is
    killed afterwards in every case and ``process.returncode`` is
    overwritten with *code*.
    """
    process.terminate()
    try:
        process.wait(1)
    except TypeError:  # python2 Popen.wait(1) fallback
        sleep(1)
    except Exception:  # python3 subprocess.TimeoutExpired
        # Best effort: the process is force-killed below regardless.
        pass
    finally:
        process.kill()
        process.returncode = code


def _close_pipes(proc):
    """Close the standard streams of *proc* that were opened as pipes."""
    for stream in (proc.stdin, proc.stdout, proc.stderr):
        # A stream is None when it was not requested as a pipe.
        if stream is not None:
            stream.close()


@contextmanager
def timeout_manager(proc, seconds=None):
    """Wait for *proc* to finish and yield the bytes it wrote to stderr.

    A falsy *seconds* waits indefinitely.  If the process does not finish
    within *seconds* it is killed and ``RuntimeError`` is raised.  The
    process pipes are closed when the context exits.
    """
    try:
        if not seconds:
            yield proc.communicate()[1]
            return

        try:
            _, error_string = proc.communicate(timeout=seconds)
        except subprocess.TimeoutExpired:
            kill(proc, -1)
            raise RuntimeError('Tesseract process timeout')
        yield error_string
    finally:
        _close_pipes(proc)


def run_once(func):
    """Decorate *func* so that it runs once and its result is reused.

    The cached value is returned on every later call, whatever arguments
    that call passes.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # The wrapper object itself is the "not computed yet" sentinel.
        if wrapper._result is wrapper:
            wrapper._result = func(*args, **kwargs)
        return wrapper._result

    wrapper._result = wrapper
    return wrapper


def get_errors(error_string):
    """Decode stderr bytes and collapse them into one stripped line."""
    return ' '.join(
        error_string.decode(DEFAULT_ENCODING).splitlines(),
    ).strip()


def cleanup(temp_name):
    """Tries to remove temp files by filename wildcard path."""
    for filename in iglob(f'{temp_name}*' if temp_name else temp_name):
        try:
            remove(filename)
        except OSError as e:
            # A file that is already gone is fine; anything else is not.
            if e.errno != ENOENT:
                raise


def prepare(image):
    """Normalise *image* for tesseract and return ``(image, extension)``.

    numpy arrays are converted to PIL images, images without a format
    default to PNG, and an alpha channel is flattened onto white.

    Raises:
        TypeError: if *image* is not a PIL image (or numpy array), or its
            format is not in ``SUPPORTED_FORMATS``.
    """
    if numpy_installed and isinstance(image, ndarray):
        image = Image.fromarray(image)

    if not isinstance(image, Image.Image):
        raise TypeError('Unsupported image object')

    extension = 'PNG' if not image.format else image.format
    if extension not in SUPPORTED_FORMATS:
        raise TypeError('Unsupported image format/type')

    if 'A' in image.getbands():
        # discard and replace the alpha channel with white background
        background = Image.new(RGB_MODE, image.size, (255, 255, 255))
        background.paste(image, (0, 0), image.getchannel('A'))
        image = background

    image.format = extension
    return image, extension


@contextmanager
def save(image):
    """Yield ``(temp_base_name, input_file_name)`` for a tesseract run.

    A string *image* is treated as a path and used as the input file
    directly; any other image is written next to the temporary base name.
    Every file starting with the temporary base name is removed on exit.
    """
    # Stays empty if the temp file cannot be created, so that the cleanup
    # below is a no-op instead of failing on an unbound name.
    temp_name = ''
    try:
        with NamedTemporaryFile(prefix='tess_', delete=False) as f:
            temp_name = f.name
            if isinstance(image, str):
                yield temp_name, realpath(normpath(normcase(image)))
                return
            image, extension = prepare(image)
            input_file_name = f'{temp_name}_input{extsep}{extension}'
            image.save(input_file_name, format=image.format)
            yield temp_name, input_file_name
    finally:
        cleanup(temp_name)


def subprocess_args(include_stdout=True):
    """Build the keyword arguments used to start the tesseract process.

    stdin and stderr are always pipes.  stdout is a pipe when
    *include_stdout* is true and is discarded otherwise.
    """
    # See https://github.com/pyinstaller/pyinstaller/wiki/Recipe-subprocess
    # for reference and comments.

    kwargs = {
        'stdin': subprocess.PIPE,
        'stderr': subprocess.PIPE,
        'startupinfo': None,
        'env': environ,
    }

    if hasattr(subprocess, 'STARTUPINFO'):
        kwargs['startupinfo'] = subprocess.STARTUPINFO()
        kwargs['startupinfo'].dwFlags |= subprocess.STARTF_USESHOWWINDOW
        kwargs['startupinfo'].wShowWindow = subprocess.SW_HIDE

    if include_stdout:
        kwargs['stdout'] = subprocess.PIPE
    else:
        kwargs['stdout'] = subprocess.DEVNULL

    return kwargs


def run_tesseract(
    input_filename,
    output_filename_base,
    extension,
    lang,
    config='',
    nice=0,
    timeout=0,
):
    """Run tesseract on *input_filename*, writing to *output_filename_base*.

    Raises:
        TesseractNotFoundError: if the executable cannot be found.
        TesseractError: if tesseract exits with a non-zero status.
        RuntimeError: if the run exceeds *timeout* seconds.
    """
    cmd_args = []

    if not sys.platform.startswith('win32') and nice != 0:
        cmd_args += ('nice', '-n', str(nice))

    cmd_args += (tesseract_cmd, input_filename, output_filename_base)

    if lang is not None:
        cmd_args += ('-l', lang)

    if config:
        cmd_args += shlex.split(config)

    if extension and extension not in {'box', 'osd', 'tsv', 'xml'}:
        cmd_args.append(extension)

    try:
        proc = subprocess.Popen(cmd_args, **subprocess_args())
    except OSError as e:
        if e.errno != ENOENT:
            raise
        else:
            raise TesseractNotFoundError()

    with timeout_manager(proc, timeout) as error_string:
        if proc.returncode:
            raise TesseractError(proc.returncode, get_errors(error_string))


def _encode_image(image):
    """Return *image* serialised as PNG bytes, ready to pipe to tesseract."""
    if numpy_installed and isinstance(image, ndarray):
        image = Image.fromarray(image)

    if not isinstance(image, Image.Image):
        raise TypeError('Unsupported image object')

    buffer = BytesIO()
    image.save(buffer, 'PNG')
    return buffer.getvalue()


def run_and_get_output(
    image,
    extension='',
    lang=None,
    config='',
    nice=0,
    timeout=0,
    return_bytes=False,
):
    """Run tesseract on *image* and return what it writes to stdout.

    Args:
        image: a PIL image or numpy array (piped to tesseract as PNG via
            stdin), or a string path that tesseract reads itself.
        extension: requested output kind; passed to tesseract as a config
            name unless the caller already selects it through *config*.
        lang: value for tesseract's ``-l`` option, or ``None`` to omit it.
        config: extra command-line arguments, split like a shell would.
        nice: niceness increment for the process (ignored on Windows).
        timeout: seconds to wait before killing the process; 0 waits
            indefinitely.
        return_bytes: return the raw stdout bytes instead of decoded text.

    Raises:
        TypeError: if *image* is neither a path nor a supported image.
        TesseractNotFoundError: if the executable cannot be found.
        TesseractError: if tesseract exits with a non-zero status.
        RuntimeError: if the run exceeds *timeout* seconds.
    """
    if isinstance(image, str):
        input_name = realpath(normpath(normcase(image)))
        payload = None
    else:
        input_name = 'stdin'
        # Encode up front so communicate() can feed stdin while draining
        # stdout/stderr, instead of writing into the pipe by hand.
        payload = _encode_image(image)

    cmd_args = []

    # ``nice`` wraps the command, so it has to come before the executable.
    if not sys.platform.startswith('win32') and nice != 0:
        cmd_args += ('nice', '-n', str(nice))

    cmd_args += (tesseract_cmd, input_name, 'stdout')

    if lang is not None:
        cmd_args += ('-l', lang)

    if config:
        cmd_args += shlex.split(config)

    # box/osd/tsv/xml are switched on by the caller through ``config``.
    # Plain text is what tesseract emits when no output config is named.
    if extension and extension not in {'box', 'osd', 'tsv', 'xml', 'txt'}:
        cmd_args.append(extension)

    try:
        proc = subprocess.Popen(cmd_args, **subprocess_args())
    except OSError as e:
        if e.errno != ENOENT:
            raise
        raise TesseractNotFoundError()

    try:
        stdout_data, stderr_data = proc.communicate(
            input=payload,
            timeout=timeout or None,
        )
    except subprocess.TimeoutExpired:
        kill(proc, -1)
        raise RuntimeError('Tesseract process timeout')
    finally:
        _close_pipes(proc)

    if proc.returncode:
        raise TesseractError(proc.returncode, get_errors(stderr_data))

    if return_bytes:
        return stdout_data
    return stdout_data.decode(DEFAULT_ENCODING)


def file_to_dict(tsv, cell_delimiter, str_col_idx):
    """Convert delimited text with a header row into a dict of columns.

    Args:
        tsv: the text; the first line is the header.
        cell_delimiter: string separating the cells of a row.
        str_col_idx: index of the column kept as text (negative counts
            from the end); other cells become ``int`` when they parse as
            numbers and stay strings otherwise.

    Returns:
        ``{header: [values...]}``, or an empty dict when there is no data
        row.
    """
    result = {}
    rows = [row.split(cell_delimiter) for row in tsv.strip().split('\n')]
    if len(rows) < 2:
        return result

    header = rows.pop(0)
    length = len(header)
    if len(rows[-1]) < length:
        # Fixes bug that occurs when last text string in TSV is null, and
        # last row is missing a final cell in TSV file
        rows[-1].append('')

    if str_col_idx < 0:
        str_col_idx += length

    for i, head in enumerate(header):
        result[head] = []
        for row in rows:
            if len(row) <= i:
                continue

            if i != str_col_idx:
                try:
                    val = int(float(row[i]))
                except ValueError:
                    val = row[i]
            else:
                val = row[i]

            result[head].append(val)

    return result


def is_valid(val, _type):
    """Return whether the string *val* can be converted to *_type*."""
    if _type is int:
        return val.isdigit()

    if _type is float:
        try:
            float(val)
            return True
        except ValueError:
            return False

    return True


def osd_to_dict(osd):
    """Parse tesseract's OSD report into a dict keyed as in ``OSD_KEYS``.

    Lines that are not exactly ``label: value``, whose label is unknown,
    or whose value does not convert to the expected type are skipped.
    """
    result = {}
    for line in osd.splitlines():
        kv = line.split(': ')
        # Unknown labels are ignored rather than raising KeyError.
        if len(kv) != 2 or kv[0] not in OSD_KEYS:
            continue
        name, convert = OSD_KEYS[kv[0]]
        if is_valid(kv[1], convert):
            result[name] = convert(kv[1])
    return result


@run_once
def get_languages(config=''):
    """Return the language names reported by ``tesseract --list-langs``.

    The result is computed once and then reused (see ``run_once``), so a
    different *config* on a later call has no effect.

    Raises:
        TesseractNotFoundError: if tesseract cannot be started or exits
            with a status other than 0 or 1.
    """
    cmd_args = [tesseract_cmd, '--list-langs']
    if config:
        cmd_args += shlex.split(config)

    try:
        result = subprocess.run(
            cmd_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
        )
    except OSError:
        raise TesseractNotFoundError()

    # tesseract 3.x
    if result.returncode not in (0, 1):
        raise TesseractNotFoundError()

    languages = []
    if result.stdout:
        # splitlines() copes with the child's line endings whatever the
        # platform's own separator is.
        for line in result.stdout.decode(DEFAULT_ENCODING).splitlines():
            lang = line.strip()
            if LANG_PATTERN.match(lang):
                languages.append(lang)

    return languages


@run_once
def get_tesseract_version():
    """
    Returns Version object of the Tesseract version

    Raises TesseractNotFoundError if tesseract cannot be started and
    SystemExit if the reported version cannot be parsed or is older than
    TESSERACT_MIN_VERSION.
    """
    try:
        output = subprocess.check_output(
            [tesseract_cmd, '--version'],
            stderr=subprocess.STDOUT,
            env=environ,
            stdin=subprocess.DEVNULL,
        )
    except OSError:
        raise TesseractNotFoundError()

    raw_version = output.decode(DEFAULT_ENCODING)
    # Drop every leading non-digit character, then keep the first word up
    # to any '-' suffix.
    str_version, *_ = raw_version.lstrip(string.printable[10:]).partition(' ')
    str_version, *_ = str_version.partition('-')

    try:
        version = parse(str_version)
    except InvalidVersion:
        raise SystemExit(f'Invalid tesseract version: "{raw_version}"')

    # An explicit check: an ``assert`` would be stripped under ``-O``.
    if version < TESSERACT_MIN_VERSION:
        raise SystemExit(f'Invalid tesseract version: "{raw_version}"')

    return version


def image_to_string(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to string
    """
    args = [image, 'txt', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DICT: lambda: {'text': run_and_get_output(*args)},
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def image_to_pdf_or_hocr(
    image,
    lang=None,
    config='',
    nice=0,
    extension='pdf',
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to pdf/hocr
    """

    if extension not in {'pdf', 'hocr'}:
        raise ValueError(f'Unsupported extension: {extension}')
    args = [image, extension, lang, config, nice, timeout, True]

    return run_and_get_output(*args)


def image_to_alto_xml(
    image,
    lang=None,
    config='',
    nice=0,
    timeout=0,
):
    """
    Returns the result of a Tesseract OCR run on the provided image to ALTO XML
    """

    if get_tesseract_version() < TESSERACT_ALTO_VERSION:
        raise ALTONotSupported()

    config = f'-c tessedit_create_alto=1 {config.strip()}'
    args = [image, 'xml', lang, config, nice, timeout, True]

    return run_and_get_output(*args)


def image_to_boxes(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns string containing recognized characters and their box boundaries
    """
    config = f'{config.strip()} batch.nochop makebox'
    args = [image, 'box', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DICT: lambda: file_to_dict(
            f'char left bottom right top page\n{run_and_get_output(*args)}',
            ' ',
            0,
        ),
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def get_pandas_output(args, config=None):
    """Run tesseract with *args* and load its TSV output into a DataFrame.

    *args* must request byte output from ``run_and_get_output``.  *config*
    may hold extra keyword arguments for ``pandas.read_csv``; a value that
    cannot update a dict is ignored.

    Raises:
        PandasNotSupported: if pandas is not installed.
    """
    if not pandas_installed:
        raise PandasNotSupported()

    kwargs = {'quoting': QUOTE_NONE, 'sep': '\t'}
    try:
        kwargs.update(config)
    except (TypeError, ValueError):
        pass

    return pd.read_csv(BytesIO(run_and_get_output(*args)), **kwargs)


def image_to_data(
    image,
    lang=None,
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
    pandas_config=None,
):
    """
    Returns string containing box boundaries, confidences,
    and other information. Requires Tesseract 3.05+
    """

    if get_tesseract_version() < TESSERACT_MIN_VERSION:
        raise TSVNotSupported()

    config = f'-c tessedit_create_tsv=1 {config.strip()}'
    args = [image, 'tsv', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DATAFRAME: lambda: get_pandas_output(
            args + [True],
            pandas_config,
        ),
        Output.DICT: lambda: file_to_dict(run_and_get_output(*args), '\t', -1),
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def image_to_osd(
    image,
    lang='osd',
    config='',
    nice=0,
    output_type=Output.STRING,
    timeout=0,
):
    """
    Returns string containing the orientation and script detection (OSD)
    """
    config = f'--psm 0 {config.strip()}'
    args = [image, 'osd', lang, config, nice, timeout]

    return {
        Output.BYTES: lambda: run_and_get_output(*(args + [True])),
        Output.DICT: lambda: osd_to_dict(run_and_get_output(*args)),
        Output.STRING: lambda: run_and_get_output(*args),
    }[output_type]()


def main():
    """Command-line entry point: OCR one image file and print its text.

    Returns the process exit status: 0 on success, 1 on a tesseract or
    file error, 2 on a usage error.
    """
    if len(sys.argv) == 2:
        filename, lang = sys.argv[1], None
    elif len(sys.argv) == 4 and sys.argv[1] == '-l':
        filename, lang = sys.argv[3], sys.argv[2]
    else:
        print('Usage: pytesseract [-l lang] input_file\n', file=sys.stderr)
        return 2

    try:
        with Image.open(filename) as img:
            print(image_to_string(img, lang=lang))
    except TesseractNotFoundError as e:
        print(f'{str(e)}\n', file=sys.stderr)
        return 1
    except OSError as e:
        print(f'{type(e).__name__}: {e}', file=sys.stderr)
        return 1

    return 0


if __name__ == '__main__':
    sys.exit(main())