Improve force ocr, enable parallel factor below 1

VikParuchuri · Dec 18, 2023 · b10899b · b10899b
1 parent 844833f
commit b10899b
Show file tree

Hide file tree

Showing 8 changed files with 86 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -88,6 +88,7 @@ First, clone the repo:
 - Install python requirements
   - `poetry install`
   - `poetry shell` to activate your poetry venv
+- On ARM Macs (M1 and later), set the `TORCH_DEVICE` setting to `mps` (more details below) for a speedup
 
 # Usage
 

diff --git a/marker/convert.py b/marker/convert.py
@@ -92,7 +92,7 @@ def convert_single_pdf(
         tess_lang,
         spell_lang,
         max_pages=max_pages,
-        parallel=parallel_factor * settings.OCR_PARALLEL_WORKERS
+        parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS)
     )
 
     out_meta["toc"] = toc
@@ -109,7 +109,7 @@ def convert_single_pdf(
         doc,
         blocks,
         layoutlm_model,
-        batch_size=settings.LAYOUT_BATCH_SIZE * parallel_factor
+        batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor)
     )
 
     # Find headers and footers
@@ -125,7 +125,7 @@ def convert_single_pdf(
         doc,
         blocks,
         order_model,
-        batch_size=settings.ORDERER_BATCH_SIZE * parallel_factor
+        batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor)
     )
 
     # Fix code blocks
@@ -148,7 +148,7 @@ def convert_single_pdf(
         blocks,
         block_types,
         nougat_model,
-        batch_size=settings.NOUGAT_BATCH_SIZE * parallel_factor
+        batch_size=int(settings.NOUGAT_BATCH_SIZE * parallel_factor)
     )
     out_meta["block_stats"]["equations"] = eq_stats
 

diff --git a/marker/debug/data.py b/marker/debug/data.py
@@ -11,7 +11,7 @@
 
 
 def dump_nougat_debug_data(doc, images, converted_spans):
-    if not settings.DEBUG_DATA_FOLDER:
+    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0:
         return
 
     if len(images) == 0:
@@ -44,7 +44,7 @@ def dump_nougat_debug_data(doc, images, converted_spans):
 
 
 def dump_bbox_debug_data(doc, blocks: List[Page]):
-    if not settings.DEBUG_DATA_FOLDER:
+    if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
         return
 
     # Remove extension from doc name

diff --git a/marker/ocr/page.py b/marker/ocr/page.py
@@ -53,7 +53,8 @@ def ocr_entire_page_ocrmp(page, lang: str, spellchecker: Optional[SpellChecker]
         outbytes,
         language=lang,
         output_type="pdf",
-        redo_ocr=True,
+        redo_ocr=None if settings.OCR_ALL_PAGES else True,
+        force_ocr=True if settings.OCR_ALL_PAGES else None,
         progress_bar=False,
         optimize=False,
         fast_web_view=1e6,

diff --git a/marker/settings.py b/marker/settings.py
@@ -37,6 +37,7 @@ class Settings(BaseSettings):
         "French": "fra",
         "German": "deu",
         "Russian": "rus",
+        "Chinese": "chi_sim",
     }
     TESSERACT_TIMEOUT: int = 20 # When to give up on OCR
     SPELLCHECK_LANGUAGES: Dict = {
@@ -46,6 +47,7 @@ class Settings(BaseSettings):
         "French": "fr",
         "German": "de",
         "Russian": "ru",
+        "Chinese": None
     }
     OCR_ALL_PAGES: bool = False # Run OCR on every page even if text can be extracted
     OCR_PARALLEL_WORKERS: int = 2 # How many CPU workers to use for OCR
@@ -101,6 +103,7 @@ class Settings(BaseSettings):
     # Debug
     DEBUG: bool = False # Enable debug logging
     DEBUG_DATA_FOLDER: Optional[str] = None
+    DEBUG_LEVEL: int = 0 # 0 to 2, 2 means log everything
 
     @computed_field
     @property

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,9 +1,12 @@
 [tool.poetry]
 name = "marker"
 version = "0.1.0"
-description = ""
-authors = ["Vik Paruchuri <[email protected]>"]
+description = "Convert PDF to markdown with high speed and accuracy."
+authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"
+license = "GPL-3.0-or-later"
+repository = "https://github.com/VikParuchuri/marker"
+keywords = ["pdf", "markdown", "ocr", "nlp"]
 
 [tool.poetry.dependencies]
 python = ">=3.9,<3.13"
@@ -29,6 +32,7 @@ ftfy = "^6.1.1"
 nltk = "^3.8.1"
 ocrmypdf = "^15.4.0"
 bitsandbytes = "^0.41.2.post2"
+grpcio = "^1.60.0"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"

diff --git a/scripts/markdown_to_pdf.sh b/scripts/markdown_to_pdf.sh
@@ -7,4 +7,4 @@ if [ $# -ne 2 ]; then
     exit 1
 fi
 
-pandoc $1 $2 --pdf-engine=xelatex --include-in-header=header.tex
+pandoc $1 -o $2 --pdf-engine=xelatex --include-in-header=header.tex