diff --git a/docs/release_notes.rst b/docs/release_notes.rst index be34248bb..fc16485ea 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -31,9 +31,11 @@ tagged yet. v15.4.0 ======= -- Added new APIs to support offline editing of the final text. Specifically, - one can now generate hOCR files with OCRmyPDF, edit them with some other tool, - and then finalize the PDF. +- Added new experimental APIs to support offline editing of the final text. + Specifically, one can now generate hOCR files with OCRmyPDF, edit them with + some other tool, and then finalize the PDF. They are experimental and + subject to change, including details of how the working folder is used. + There is no command line interface. - Code reorganization: executors, progress bars, initialization and setup. - Fixed test coverage in cases where the coverage tool did not properly trace into threads or subprocesses. This code was still being tested but appeared diff --git a/src/ocrmypdf/__init__.py b/src/ocrmypdf/__init__.py index 5905aa58c..474619a61 100644 --- a/src/ocrmypdf/__init__.py +++ b/src/ocrmypdf/__init__.py @@ -14,7 +14,11 @@ configure_debug_logging, ) from ocrmypdf._version import PROGRAM_NAME, __version__ -from ocrmypdf.api import Verbosity, configure_logging, hocr_to_ocr_pdf, ocr, pdf_to_hocr +from ocrmypdf.api import ( + Verbosity, + configure_logging, + ocr, +) from ocrmypdf.exceptions import ( BadArgsError, DpiError, @@ -45,7 +49,6 @@ 'ExitCode', 'ExitCodeException', 'helpers', - 'hocr_to_ocr_pdf', 'hocrtransform', 'hookimpl', 'InputFileError', @@ -55,7 +58,6 @@ 'OrientationConfidence', 'OutputFileAccessError', 'PageContext', - 'pdf_to_hocr', 'pdfa', 'PdfContext', 'pdfinfo', diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py index 98e22f680..9d78fe574 100644 --- a/src/ocrmypdf/api.py +++ b/src/ocrmypdf/api.py @@ -375,7 +375,7 @@ def ocr( # noqa: D417 return run_pipeline(options=options, plugin_manager=plugin_manager) -def pdf_to_hocr( # noqa: D417 +def _pdf_to_hocr( # noqa: D417 input_pdf: Path, output_folder: Path, *, @@ -432,6 +432,8 @@ def pdf_to_hocr( # noqa: D417 For arguments not explicitly documented here, see documentation for the equivalent command line parameter. + This API is **experimental** and subject to change. + Args: input_pdf: Input PDF file path. output_folder: Output folder path. @@ -468,7 +470,7 @@ def pdf_to_hocr( # noqa: D417 return run_hocr_pipeline(options=options, plugin_manager=plugin_manager) -def hocr_to_ocr_pdf( # noqa: D417 +def _hocr_to_ocr_pdf( # noqa: D417 work_folder: Path, output_file: Path, *, @@ -496,6 +498,8 @@ def hocr_to_ocr_pdf( # noqa: D417 For arguments not explicitly documented here, see documentation for the equivalent command line parameter. + This API is **experimental** and subject to change. + Args: work_folder: Work folder path, as generated by :func:`pdf_to_hocr`. output_file: Output PDF file path. @@ -543,8 +547,6 @@ def hocr_to_ocr_pdf( # noqa: D417 'get_parser', 'get_plugin_manager', 'ocr', - 'pdf_to_hocr', 'run_pipeline', 'run_pipeline_cli', - 'hocr_to_ocr_pdf', ] diff --git a/tests/test_api.py b/tests/test_api.py index 4e92c7194..540d4ac08 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -10,6 +10,7 @@ from pdfminer.high_level import extract_text import ocrmypdf +import ocrmypdf.api def test_language_list(): @@ -29,7 +30,7 @@ def test_stream_api(resources: Path): def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path): - ocrmypdf.pdf_to_hocr( + ocrmypdf.api._pdf_to_hocr( resources / 'multipage.pdf', outdir, language='eng', @@ -40,12 +41,12 @@ def test_hocr_api_multipage(resources: Path, outdir: Path, outpdf: Path): assert (outdir / '000006_ocr_hocr.hocr').exists() assert not (outdir / '000004_ocr_hocr.hocr').exists() - ocrmypdf.hocr_to_ocr_pdf(outdir, outpdf) + ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf) assert outpdf.exists() def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path): - ocrmypdf.pdf_to_hocr( + ocrmypdf.api._pdf_to_hocr( resources / 'ccitt.pdf', outdir, language='eng', @@ -57,7 +58,7 @@ def test_hocr_to_pdf_api(resources: Path, outdir: Path, outpdf: Path): mangled = hocr.replace('the', 'hocr') (outdir / '000001_ocr_hocr.hocr').write_text(mangled, encoding='utf-8') - ocrmypdf.hocr_to_ocr_pdf(outdir, outpdf, optimize=0) + ocrmypdf.api._hocr_to_ocr_pdf(outdir, outpdf, optimize=0) text = extract_text(outpdf) assert 'hocr' in text and 'the' not in text